/*
 * dpsoftrast.c
 * Source: xonotic/darkplaces.git (gitweb blob view on de.git.xonotic.org)
 * Commit subject: "clipplane validation fix"
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Compiler-specific alignment, memory barrier and atomic counter primitives.
// Nothing is defined here unless SSE_POSSIBLE is set; any macro this section
// leaves undefined is supplied by the #ifndef fallbacks that follow it.
#if defined(SSE_POSSIBLE)
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
	#endif

	// declaration wrappers: ALIGN() asks for 16 byte alignment, ATOMIC() for 32 byte alignment
	#if defined(__APPLE__) || defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
	#endif

	// every supported compiler uses _mm_sfence() as the barrier
	// (disabled alternatives: __sync_synchronize() for GCC, MemoryBarrier() for MSVC)
	#if defined(__APPLE__) || defined(__GNUC__) || defined(_MSC_VER)
		#define MEMORY_BARRIER (_mm_sfence())
	#endif

	// atomic counter type and the increment / decrement / add operations on it
	#if defined(__APPLE__)
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__)
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif
50
// Portable fallbacks for every primitive the platform section did not define:
// no alignment request, no barrier, and plain (non-atomic) integer arithmetic.
#if !defined(ALIGN)
#	define ALIGN(var) var
#endif

#if !defined(ATOMIC)
#	define ATOMIC(var) var
#endif

#if !defined(MEMORY_BARRIER)
#	define MEMORY_BARRIER ((void)0)
#endif

#if !defined(ATOMIC_COUNTER)
#	define ATOMIC_COUNTER int
#endif

// like the platform versions, increment/decrement evaluate to the updated value
#if !defined(ATOMIC_INCREMENT)
#	define ATOMIC_INCREMENT(counter) (++(counter))
#endif

#if !defined(ATOMIC_DECREMENT)
#	define ATOMIC_DECREMENT(counter) (--(counter))
#endif

// ATOMIC_ADD yields no value
#if !defined(ATOMIC_ADD)
#	define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Allocation wrappers. With SSE the buffers come from _mm_malloc, aligned to
// ATOMIC_SIZE bytes; otherwise the C allocator is used directly.
// Memory from MM_MALLOC/MM_CALLOC must be released with MM_FREE.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// uninitialized allocation of size bytes
#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// calloc()-style allocation: nmemb elements of size bytes each, zero filled.
// returns NULL if nmemb*size does not fit in size_t or if the allocation fails
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() refuses a request whose byte count wraps around; do the same
	// instead of silently allocating a too-small buffer
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// index of each per-vertex attribute array (used to size and index
// post_array4f[] and the triangle attribs[] table);
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays, not an array itself
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags;
111         int width;
112         int height;
113         int depth;
114         int sides;
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps;
117         int size;
118         ATOMIC_COUNTER binds;
119         unsigned char *bytes;
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// command structs are declared through ALIGN(); COMMAND_SIZE mirrors ALIGN_SIZE
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header shared by every command: DEFCOMMAND structs start with the same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;       // one of the DPSOFTRAST_OPCODE_* values
	unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares, for one command:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct type DPSOFTRAST_Command_<name>, made of the common
//     opcode/commandsize header followed by the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
144
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand;
151         int usedcommands;
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluate attribute array `arrayindex` of `triangle` at the position of `span`:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1]
// `triangle` and `span` are pointer expressions; `data` and `slope` are written to.
// All arguments are parenthesized in the expansion so that expressions such as
// &localspan can be passed safely.

// SSE variant: data and slope are __m128 lvalues
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// scalar variant: data and slope are arrays (or pointers to) 4 floats
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels produced from a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle;             // triangle this span was generated by
	int x, y;                 // framebuffer x and y coords
	int startx, endx;         // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
193
// capacities of the per-thread spans[] and triangles[] buffers
#define DPSOFTRAST_DRAW_MAXSPANS     1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of the per-thread validate field; each one selects a group of derived
// values that DPSOFTRAST_Validate recomputes
#define DPSOFTRAST_VALIDATE_FB        0x1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 0x2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 0x4
// all of the above
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// blend modes that DPSOFTRAST_RecalcBlendFunc selects from the blendfunc pair;
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
217
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
221         int index;
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2];
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4];
235         int scissor[4];
236         float depthrange[2];
237         float polygonoffset[2];
238         float clipplane[4];
239         ALIGN(float fb_clipplane[4]);
240
241         int shader_mode;
242         int shader_permutation;
243         int shader_exactspecularmath;
244
245         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
246         
247         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
248         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
249
250         // DPSOFTRAST_VALIDATE_ flags
251         int validate;
252
253         // derived values (DPSOFTRAST_VALIDATE_FB)
254         int fb_colormask;
255         int fb_scissor[4];
256         ALIGN(float fb_viewportcenter[4]);
257         ALIGN(float fb_viewportscale[4]);
258
259         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
260         int fb_depthfunc;
261
262         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
263         int fb_blendmode;
264
265         // band boundaries
266         int miny1;
267         int maxy1;
268         int miny2;
269         int maxy2;
270
271         ATOMIC(volatile int commandoffset);
272
273         volatile bool waiting;
274         volatile bool starving;
275         void *waitcond;
276         void *drawcond;
277         void *drawmutex;
278
279         int numspans;
280         int numtriangles;
281         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
282         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
283 }
284 DPSOFTRAST_State_Thread);
285
286 typedef ATOMIC(struct DPSOFTRAST_State_s
287 {
288         int fb_width;
289         int fb_height;
290         unsigned int *fb_depthpixels;
291         unsigned int *fb_colorpixels[4];
292
293         int viewport[4];
294         ALIGN(float fb_viewportcenter[4]);
295         ALIGN(float fb_viewportscale[4]);
296
297         float color[4];
298         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
299         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
300
301         const float *pointer_vertex3f;
302         const float *pointer_color4f;
303         const unsigned char *pointer_color4ub;
304         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
305         int stride_vertex;
306         int stride_color;
307         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
308         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
309         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
310
311         int firstvertex;
312         int numvertices;
313         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
314         float *screencoord4f;
315         int drawstarty;
316         int drawendy;
317         int drawclipped;
318         
319         int shader_mode;
320         int shader_permutation;
321         int shader_exactspecularmath;
322
323         int texture_max;
324         int texture_end;
325         int texture_firstfree;
326         DPSOFTRAST_Texture *texture;
327
328         int bigendian;
329
330         // error reporting
331         const char *errorstring;
332
333         bool usethreads;
334         int interlace;
335         int numthreads;
336         DPSOFTRAST_State_Thread *threads;
337
338         ATOMIC(volatile int drawcommand);
339
340         DPSOFTRAST_State_Command_Pool commandpool;
341 }
342 DPSOFTRAST_State);
343
344 DPSOFTRAST_State dpsoftrast;
345
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels into one int laid out as 0xAARRGGBB; each channel
// is scaled by 255 and rounded to nearest by adding 0.5 before truncation.
// arguments are parenthesized so that expressions (e.g. c*scale or a+b) convert correctly
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to the integer depth value DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
351
352 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
353 {
354         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
355         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
356         fb_viewportcenter[3] = 0.5f;
357         fb_viewportcenter[0] = 0.0f;
358         fb_viewportscale[1] = 0.5f * viewport[2];
359         fb_viewportscale[2] = -0.5f * viewport[3];
360         fb_viewportscale[3] = 0.5f;
361         fb_viewportscale[0] = 1.0f;
362 }
363
364 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
365 {
366         if (dpsoftrast.interlace)
367         {
368                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
371                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
372         }
373         else
374         {
375                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
376                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
377         }
378 }
379
380 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
381 {
382         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
383         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
384         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
385         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
386         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
387 }
388
389 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
390 {
391         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
392         // and viewport projection values
393         int x1, x2;
394         int y1, y2;
395         x1 = thread->scissor[0];
396         x2 = thread->scissor[0] + thread->scissor[2];
397         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
398         y2 = dpsoftrast.fb_height - thread->scissor[1];
399         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
400         if (x1 < 0) x1 = 0;
401         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
402         if (y1 < 0) y1 = 0;
403         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
404         thread->fb_scissor[0] = x1;
405         thread->fb_scissor[1] = y1;
406         thread->fb_scissor[2] = x2 - x1;
407         thread->fb_scissor[3] = y2 - y1;
408
409         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
410         DPSOFTRAST_RecalcClipPlane(thread);
411         DPSOFTRAST_RecalcThread(thread);
412 }
413
414 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
415 {
416         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
417 }
418
419 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
420 {
421         if (thread->blendsubtract)
422         {
423                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
424                 {
425                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
426                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
427                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
428                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
429                 }
430         }
431         else
432         {       
433                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
434                 {
435                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
436                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
437                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
438                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
439                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
440                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
441                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
442                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
443                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
444                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
445                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
446                 }
447         }
448 }
449
// inline pre-check for DPSOFTRAST_Validate: the function is only called when
// at least one of the flags in f is still set in thread->validate.
// the expression always evaluates to 0; arguments are parenthesized so that
// pointer expressions (e.g. threads + i) can be passed
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
451
452 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
453 {
454         mask &= thread->validate;
455         if (!mask)
456                 return;
457         if (mask & DPSOFTRAST_VALIDATE_FB)
458         {
459                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
460                 DPSOFTRAST_RecalcFB(thread);
461         }
462         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
463         {
464                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
465                 DPSOFTRAST_RecalcDepthFunc(thread);
466         }
467         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
468         {
469                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
470                 DPSOFTRAST_RecalcBlendFunc(thread);
471         }
472 }
473
474 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
475 {
476         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
477                 return &dpsoftrast.texture[index];
478         return NULL;
479 }
480
481 static void DPSOFTRAST_Texture_Grow(void)
482 {
483         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
484         DPSOFTRAST_State_Thread *thread;
485         int i;
486         int j;
487         DPSOFTRAST_Flush();
488         // expand texture array as needed
489         if (dpsoftrast.texture_max < 1024)
490                 dpsoftrast.texture_max = 1024;
491         else
492                 dpsoftrast.texture_max *= 2;
493         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
494         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
495                 if (dpsoftrast.texbound[i])
496                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
497         for (j = 0; j < dpsoftrast.numthreads; j++)
498         {
499                 thread = &dpsoftrast.threads[j];
500                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501                         if (thread->texbound[i])
502                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
503         }
504 }
505
506 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
507 {
508         int w;
509         int h;
510         int d;
511         int size;
512         int s;
513         int texnum;
514         int mipmaps;
515         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
516         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
517         DPSOFTRAST_Texture *texture;
518         if (width*height*depth < 1)
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
521                 return 0;
522         }
523         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
526                 return 0;
527         }
528         switch(texformat)
529         {
530         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
531         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
532         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
533                 break;
534         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
535                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
536                 {
537                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
538                         return 0;
539                 }
540                 if (depth != 1)
541                 {
542                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
543                         return 0;
544                 }
545                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
546                 {
547                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
548                         return 0;
549                 }
550                 break;
551         }
552         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
553         {
554                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
555                 return 0;
556         }
557         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
558         {
559                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
560                 return 0;
561         }
562         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
563         {
564                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
565                 return 0;
566         }
567         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
568         {
569                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
570                 return 0;
571         }
572         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
573         {
574                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
575                 return 0;
576         }
577         // find first empty slot in texture array
578         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
579                 if (!dpsoftrast.texture[texnum].bytes)
580                         break;
581         dpsoftrast.texture_firstfree = texnum + 1;
582         if (dpsoftrast.texture_max <= texnum)
583                 DPSOFTRAST_Texture_Grow();
584         if (dpsoftrast.texture_end <= texnum)
585                 dpsoftrast.texture_end = texnum + 1;
586         texture = &dpsoftrast.texture[texnum];
587         memset(texture, 0, sizeof(*texture));
588         texture->flags = flags;
589         texture->width = width;
590         texture->height = height;
591         texture->depth = depth;
592         texture->sides = sides;
593         texture->binds = 0;
594         w = width;
595         h = height;
596         d = depth;
597         size = 0;
598         mipmaps = 0;
599         w = width;
600         h = height;
601         d = depth;
602         for (;;)
603         {
604                 s = w * h * d * sides * 4;
605                 texture->mipmap[mipmaps][0] = size;
606                 texture->mipmap[mipmaps][1] = s;
607                 texture->mipmap[mipmaps][2] = w;
608                 texture->mipmap[mipmaps][3] = h;
609                 texture->mipmap[mipmaps][4] = d;
610                 size += s;
611                 mipmaps++;
612                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
613                         break;
614                 if (w > 1) w >>= 1;
615                 if (h > 1) h >>= 1;
616                 if (d > 1) d >>= 1;
617         }
618         texture->mipmaps = mipmaps;
619         texture->size = size;
620
621         // allocate the pixels now
622         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
623
624         return texnum;
625 }
626 void DPSOFTRAST_Texture_Free(int index)
627 {
628         DPSOFTRAST_Texture *texture;
629         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
630         if (texture->binds)
631                 DPSOFTRAST_Flush();
632         if (texture->bytes)
633                 MM_FREE(texture->bytes);
634         texture->bytes = NULL;
635         memset(texture, 0, sizeof(*texture));
636         // adjust the free range and used range
637         if (dpsoftrast.texture_firstfree > index)
638                 dpsoftrast.texture_firstfree = index;
639         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
640                 dpsoftrast.texture_end--;
641 }
642 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
643 {
644         int i, x, y, z, w, layer0, layer1, row0, row1;
645         unsigned char *o, *i0, *i1, *i2, *i3;
646         DPSOFTRAST_Texture *texture;
647         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
648         if (texture->mipmaps <= 1)
649                 return;
650         for (i = 1;i < texture->mipmaps;i++)
651         {
652                 for (z = 0;z < texture->mipmap[i][4];z++)
653                 {
654                         layer0 = z*2;
655                         layer1 = z*2+1;
656                         if (layer1 >= texture->mipmap[i-1][4])
657                                 layer1 = texture->mipmap[i-1][4]-1;
658                         for (y = 0;y < texture->mipmap[i][3];y++)
659                         {
660                                 row0 = y*2;
661                                 row1 = y*2+1;
662                                 if (row1 >= texture->mipmap[i-1][3])
663                                         row1 = texture->mipmap[i-1][3]-1;
664                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
665                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
666                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
667                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
668                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
669                                 w = texture->mipmap[i][2];
670                                 if (layer1 > layer0)
671                                 {
672                                         if (texture->mipmap[i-1][2] > 1)
673                                         {
674                                                 // average 3D texture
675                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
676                                                 {
677                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
678                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
679                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
680                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
681                                                 }
682                                         }
683                                         else
684                                         {
685                                                 // average 3D mipmap with parent width == 1
686                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
687                                                 {
688                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
689                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
690                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
691                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
692                                                 }
693                                         }
694                                 }
695                                 else
696                                 {
697                                         if (texture->mipmap[i-1][2] > 1)
698                                         {
699                                                 // average 2D texture (common case)
700                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
701                                                 {
702                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
703                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
704                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
705                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
706                                                 }
707                                         }
708                                         else
709                                         {
710                                                 // 2D texture with parent width == 1
711                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
712                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
713                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
714                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
715                                         }
716                                 }
717                         }
718                 }
719         }
720 }
721 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
722 {
723         DPSOFTRAST_Texture *texture;
724         unsigned char *dst;
725         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
726         if (texture->binds)
727                 DPSOFTRAST_Flush();
728         if (pixels)
729         {
730                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
731                 while (blockheight > 0)
732                 {
733                         memcpy(dst, pixels, blockwidth * 4);
734                         pixels += blockwidth * 4;
735                         dst += texture->mipmap[0][2] * 4;
736                         blockheight--;
737                 }
738         }
739         DPSOFTRAST_Texture_CalculateMipmaps(index);
740 }
741 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
742 {
743         DPSOFTRAST_Texture *texture;
744         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
745         if (texture->binds)
746                 DPSOFTRAST_Flush();
747         if (pixels)
748                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
749         DPSOFTRAST_Texture_CalculateMipmaps(index);
750 }
751 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755         return texture->mipmap[mip][2];
756 }
757 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         return texture->mipmap[mip][3];
762 }
763 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
764 {
765         DPSOFTRAST_Texture *texture;
766         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767         return texture->mipmap[mip][4];
768 }
769 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
770 {
771         DPSOFTRAST_Texture *texture;
772         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773         if (texture->binds)
774                 DPSOFTRAST_Flush();
775         return texture->bytes + texture->mipmap[mip][0];
776 }
777 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
778 {
779         DPSOFTRAST_Texture *texture;
780         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
781         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
782         {
783                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
784                 return;
785         }
786         if (texture->binds)
787                 DPSOFTRAST_Flush();
788         texture->filter = filter;
789 }
790
791 static void DPSOFTRAST_Draw_FlushThreads(void);
792
793 static void DPSOFTRAST_Draw_SyncCommands(void)
794 {
795         if(dpsoftrast.usethreads) MEMORY_BARRIER;
796         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
797 }
798
799 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
800 {
801         DPSOFTRAST_State_Thread *thread;
802         int i;
803         int freecommand = dpsoftrast.commandpool.freecommand;
804         int usedcommands = dpsoftrast.commandpool.usedcommands;
805         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
806                 return;
807         DPSOFTRAST_Draw_SyncCommands();
808         for(;;)
809         {
810                 int waitindex = -1;
811                 int commandoffset;
812                 usedcommands = 0;
813                 for (i = 0; i < dpsoftrast.numthreads; i++)
814                 {
815                         thread = &dpsoftrast.threads[i]; 
816                         commandoffset = freecommand - thread->commandoffset;
817                         if (commandoffset < 0)
818                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
819                         if (commandoffset > usedcommands)
820                         {
821                                 waitindex = i;
822                                 usedcommands = commandoffset;
823                         }
824                 }
825                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
826                         break;
827                 thread = &dpsoftrast.threads[waitindex];
828                 Thread_LockMutex(thread->drawmutex);
829                 if (thread->commandoffset != dpsoftrast.drawcommand)
830                 {
831                         thread->waiting = true;
832                         if (thread->starving) Thread_CondSignal(thread->drawcond);
833                         Thread_CondWait(thread->waitcond, thread->drawmutex);
834                         thread->waiting = false;
835                 }
836                 Thread_UnlockMutex(thread->drawmutex);
837         }
838         dpsoftrast.commandpool.usedcommands = usedcommands;
839 }
840
// Rounds size up to the next multiple of COMMAND_SIZE (which has to be a
// power of two): step past the boundary, then drop the remainder.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) - (((size) + (COMMAND_SIZE-1)) & (COMMAND_SIZE-1)))
// Allocates a command of the named type from the command pool, using the
// type's size rounded up with DPSOFTRAST_ALIGNCOMMAND, and yields it cast to
// DPSOFTRAST_Command_<name> *.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
845
846 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
847 {
848         DPSOFTRAST_Command *command;
849         int freecommand = dpsoftrast.commandpool.freecommand;
850         int usedcommands = dpsoftrast.commandpool.usedcommands;
851         int extra = sizeof(DPSOFTRAST_Command);
852         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
854         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
855         {
856                 if (dpsoftrast.usethreads)
857                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
858                 else
859                         DPSOFTRAST_Draw_FlushThreads();
860                 freecommand = dpsoftrast.commandpool.freecommand;
861                 usedcommands = dpsoftrast.commandpool.usedcommands;
862         }
863         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
864         {
865                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
866                 command->opcode = DPSOFTRAST_OPCODE_Reset;
867                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
868                 freecommand = 0;
869         }
870         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
871         command->opcode = opcode;
872         command->commandsize = size;
873         freecommand += size;
874         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
875                 freecommand = 0;
876         dpsoftrast.commandpool.freecommand = freecommand;
877         dpsoftrast.commandpool.usedcommands = usedcommands + size;
878         return command;
879 }
880
881 static void DPSOFTRAST_UndoCommand(int size)
882 {
883         int freecommand = dpsoftrast.commandpool.freecommand;
884         int usedcommands = dpsoftrast.commandpool.usedcommands;
885         freecommand -= size;
886         if (freecommand < 0)
887                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
888         usedcommands -= size;
889         dpsoftrast.commandpool.freecommand = freecommand;
890         dpsoftrast.commandpool.usedcommands = usedcommands;
891 }
892                 
893 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
894 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
895 {
896         thread->viewport[0] = command->x;
897         thread->viewport[1] = command->y;
898         thread->viewport[2] = command->width;
899         thread->viewport[3] = command->height;
900         thread->validate |= DPSOFTRAST_VALIDATE_FB;
901 }
902 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
903 {
904         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
905         command->x = x;
906         command->y = y;
907         command->width = width;
908         command->height = height;
909
910         dpsoftrast.viewport[0] = x;
911         dpsoftrast.viewport[1] = y;
912         dpsoftrast.viewport[2] = width;
913         dpsoftrast.viewport[3] = height;
914         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
915 }
916
917 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
918 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
919 {
920         int i, x1, y1, x2, y2, w, h, x, y;
921         int miny1, maxy1, miny2, maxy2;
922         int bandy;
923         unsigned int *p;
924         unsigned int c;
925         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
926         miny1 = thread->miny1;
927         maxy1 = thread->maxy1;
928         miny2 = thread->miny2;
929         maxy2 = thread->maxy2;
930         x1 = thread->fb_scissor[0];
931         y1 = thread->fb_scissor[1];
932         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
933         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
934         if (y1 < miny1) y1 = miny1;
935         if (y2 > maxy2) y2 = maxy2;
936         w = x2 - x1;
937         h = y2 - y1;
938         if (w < 1 || h < 1)
939                 return;
940         // FIXME: honor fb_colormask?
941         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
942         for (i = 0;i < 4;i++)
943         {
944                 if (!dpsoftrast.fb_colorpixels[i])
945                         continue;
946                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
947                 for (;y < bandy;y++)
948                 {
949                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
950                         for (x = x1;x < x2;x++)
951                                 p[x] = c;
952                 }
953         }
954 }
955 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
956 {
957         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
958         command->r = r;
959         command->g = g;
960         command->b = b;
961         command->a = a;
962 }
963
964 DEFCOMMAND(3, ClearDepth, float depth;)
965 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
966 {
967         int x1, y1, x2, y2, w, h, x, y;
968         int miny1, maxy1, miny2, maxy2;
969         int bandy;
970         unsigned int *p;
971         unsigned int c;
972         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
973         miny1 = thread->miny1;
974         maxy1 = thread->maxy1;
975         miny2 = thread->miny2;
976         maxy2 = thread->maxy2;
977         x1 = thread->fb_scissor[0];
978         y1 = thread->fb_scissor[1];
979         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
980         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
981         if (y1 < miny1) y1 = miny1;
982         if (y2 > maxy2) y2 = maxy2;
983         w = x2 - x1;
984         h = y2 - y1;
985         if (w < 1 || h < 1)
986                 return;
987         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
988         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
989         for (;y < bandy;y++)
990         {
991                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
992                 for (x = x1;x < x2;x++)
993                         p[x] = c;
994         }
995 }
996 void DPSOFTRAST_ClearDepth(float d)
997 {
998         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
999         command->depth = d;
1000 }
1001
1002 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1003 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1004 {
1005         thread->colormask[0] = command->r != 0;
1006         thread->colormask[1] = command->g != 0;
1007         thread->colormask[2] = command->b != 0;
1008         thread->colormask[3] = command->a != 0;
1009         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1010 }
1011 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1012 {
1013         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1014         command->r = r;
1015         command->g = g;
1016         command->b = b;
1017         command->a = a;
1018 }
1019
1020 DEFCOMMAND(5, DepthTest, int enable;)
1021 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1022 {
1023         thread->depthtest = command->enable;
1024         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1025 }
1026 void DPSOFTRAST_DepthTest(int enable)
1027 {
1028         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1029         command->enable = enable;
1030 }
1031
1032 DEFCOMMAND(6, ScissorTest, int enable;)
1033 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1034 {
1035         thread->scissortest = command->enable;
1036         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1037 }
1038 void DPSOFTRAST_ScissorTest(int enable)
1039 {
1040         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1041         command->enable = enable;
1042 }
1043
1044 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1045 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1046 {
1047         thread->scissor[0] = command->x;
1048         thread->scissor[1] = command->y;
1049         thread->scissor[2] = command->width;
1050         thread->scissor[3] = command->height;
1051         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1052 }
1053 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1054 {
1055         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1056         command->x = x;
1057         command->y = y;
1058         command->width = width;
1059         command->height = height;
1060 }
1061
1062 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1063 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1064 {
1065         thread->blendfunc[0] = command->sfactor;
1066         thread->blendfunc[1] = command->dfactor;
1067         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1068 }
1069 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1070 {
1071         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1072         command->sfactor = sfactor;
1073         command->dfactor = dfactor;
1074 }
1075
1076 DEFCOMMAND(9, BlendSubtract, int enable;)
1077 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1078 {
1079         thread->blendsubtract = command->enable;
1080         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1081 }
1082 void DPSOFTRAST_BlendSubtract(int enable)
1083 {
1084         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1085         command->enable = enable;
1086 }
1087
1088 DEFCOMMAND(10, DepthMask, int enable;)
1089 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1090 {
1091         thread->depthmask = command->enable;
1092 }
1093 void DPSOFTRAST_DepthMask(int enable)
1094 {
1095         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1096         command->enable = enable;
1097 }
1098
1099 DEFCOMMAND(11, DepthFunc, int func;)
1100 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1101 {
1102         thread->depthfunc = command->func;
1103 }
1104 void DPSOFTRAST_DepthFunc(int func)
1105 {
1106         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1107         command->func = func;
1108 }
1109
1110 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1111 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1112 {
1113         thread->depthrange[0] = command->nearval;
1114         thread->depthrange[1] = command->farval;
1115 }
1116 void DPSOFTRAST_DepthRange(float nearval, float farval)
1117 {
1118         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1119         command->nearval = nearval;
1120         command->farval = farval;
1121 }
1122
1123 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1124 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1125 {
1126         thread->polygonoffset[0] = command->alongnormal;
1127         thread->polygonoffset[1] = command->intoview;
1128 }
1129 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1130 {
1131         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1132         command->alongnormal = alongnormal;
1133         command->intoview = intoview;
1134 }
1135
1136 DEFCOMMAND(14, CullFace, int mode;)
1137 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1138 {
1139         thread->cullface = command->mode;
1140 }
1141 void DPSOFTRAST_CullFace(int mode)
1142 {
1143         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1144         command->mode = mode;
1145 }
1146
1147 DEFCOMMAND(15, AlphaTest, int enable;)
1148 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1149 {
1150         thread->alphatest = command->enable;
1151 }
1152 void DPSOFTRAST_AlphaTest(int enable)
1153 {
1154         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1155         command->enable = enable;
1156 }
1157
1158 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1159 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1160 {
1161         thread->alphafunc = command->func;
1162         thread->alphavalue = command->ref;
1163 }
1164 void DPSOFTRAST_AlphaFunc(int func, float ref)
1165 {
1166         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1167         command->func = func;
1168         command->ref = ref;
1169 }
1170
1171 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1172 {
1173         dpsoftrast.color[0] = r;
1174         dpsoftrast.color[1] = g;
1175         dpsoftrast.color[2] = b;
1176         dpsoftrast.color[3] = a;
1177 }
1178
1179 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1180 {
1181         int outstride = blockwidth * 4;
1182         int instride = dpsoftrast.fb_width * 4;
1183         int bx1 = blockx;
1184         int by1 = blocky;
1185         int bx2 = blockx + blockwidth;
1186         int by2 = blocky + blockheight;
1187         int bw;
1188         int x;
1189         int y;
1190         unsigned char *inpixels;
1191         unsigned char *b;
1192         unsigned char *o;
1193         DPSOFTRAST_Flush();
1194         if (bx1 < 0) bx1 = 0;
1195         if (by1 < 0) by1 = 0;
1196         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1197         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1198         bw = bx2 - bx1;
1199         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1200         if (dpsoftrast.bigendian)
1201         {
1202                 for (y = by1;y < by2;y++)
1203                 {
1204                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1205                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1206                         for (x = bx1;x < bx2;x++)
1207                         {
1208                                 o[0] = b[3];
1209                                 o[1] = b[2];
1210                                 o[2] = b[1];
1211                                 o[3] = b[0];
1212                                 o += 4;
1213                                 b += 4;
1214                         }
1215                 }
1216         }
1217         else
1218         {
1219                 for (y = by1;y < by2;y++)
1220                 {
1221                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1222                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1223                         memcpy(o, b, bw*4);
1224                 }
1225         }
1226
1227 }
1228 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1229 {
1230         int tx1 = tx;
1231         int ty1 = ty;
1232         int tx2 = tx + width;
1233         int ty2 = ty + height;
1234         int sx1 = sx;
1235         int sy1 = sy;
1236         int sx2 = sx + width;
1237         int sy2 = sy + height;
1238         int swidth;
1239         int sheight;
1240         int twidth;
1241         int theight;
1242         int sw;
1243         int sh;
1244         int tw;
1245         int th;
1246         int y;
1247         unsigned int *spixels;
1248         unsigned int *tpixels;
1249         DPSOFTRAST_Texture *texture;
1250         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1251         if (mip < 0 || mip >= texture->mipmaps) return;
1252         DPSOFTRAST_Flush();
1253         spixels = dpsoftrast.fb_colorpixels[0];
1254         swidth = dpsoftrast.fb_width;
1255         sheight = dpsoftrast.fb_height;
1256         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1257         twidth = texture->mipmap[mip][2];
1258         theight = texture->mipmap[mip][3];
1259         if (tx1 < 0) tx1 = 0;
1260         if (ty1 < 0) ty1 = 0;
1261         if (tx2 > twidth) tx2 = twidth;
1262         if (ty2 > theight) ty2 = theight;
1263         if (sx1 < 0) sx1 = 0;
1264         if (sy1 < 0) sy1 = 0;
1265         if (sx2 > swidth) sx2 = swidth;
1266         if (sy2 > sheight) sy2 = sheight;
1267         tw = tx2 - tx1;
1268         th = ty2 - ty1;
1269         sw = sx2 - sx1;
1270         sh = sy2 - sy1;
1271         if (tw > sw) tw = sw;
1272         if (th > sh) th = sh;
1273         if (tw < 1 || th < 1)
1274                 return;
1275         sy1 = sheight - 1 - sy1;
1276         for (y = 0;y < th;y++)
1277                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1278         if (texture->mipmaps > 1)
1279                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1280 }
1281
1282 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1283 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1284 {
1285         if (thread->texbound[command->unitnum])
1286                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1287         thread->texbound[command->unitnum] = command->texture;
1288 }
1289 void DPSOFTRAST_SetTexture(int unitnum, int index)
1290 {
1291         DPSOFTRAST_Command_SetTexture *command;
1292         DPSOFTRAST_Texture *texture;
1293         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1294         {
1295                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1296                 return;
1297         }
1298         texture = DPSOFTRAST_Texture_GetByIndex(index);
1299         if (index && !texture)
1300         {
1301                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1302                 return;
1303         }
1304
1305         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1306         command->unitnum = unitnum;
1307         command->texture = texture;
1308
1309         dpsoftrast.texbound[unitnum] = texture;
1310         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1311 }
1312
1313 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1314 {
1315         dpsoftrast.pointer_vertex3f = vertex3f;
1316         dpsoftrast.stride_vertex = stride;
1317 }
1318 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1319 {
1320         dpsoftrast.pointer_color4f = color4f;
1321         dpsoftrast.pointer_color4ub = NULL;
1322         dpsoftrast.stride_color = stride;
1323 }
1324 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1325 {
1326         dpsoftrast.pointer_color4f = NULL;
1327         dpsoftrast.pointer_color4ub = color4ub;
1328         dpsoftrast.stride_color = stride;
1329 }
1330 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1331 {
1332         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1333         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1334         dpsoftrast.stride_texcoord[unitnum] = stride;
1335 }
1336
1337 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1338 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1339 {
1340         thread->shader_mode = command->mode;
1341         thread->shader_permutation = command->permutation;
1342         thread->shader_exactspecularmath = command->exactspecularmath;
1343 }
1344 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1345 {
1346         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1347         command->mode = mode;
1348         command->permutation = permutation;
1349         command->exactspecularmath = exactspecularmath;
1350
1351         dpsoftrast.shader_mode = mode;
1352         dpsoftrast.shader_permutation = permutation;
1353         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1354 }
1355
1356 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1357 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1358 {
1359         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1360 }
1361 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1362 {
1363         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1364         command->index = index;
1365         command->val[0] = v0;
1366         command->val[1] = v1;
1367         command->val[2] = v2;
1368         command->val[3] = v3;
1369
1370         dpsoftrast.uniform4f[index*4+0] = v0;
1371         dpsoftrast.uniform4f[index*4+1] = v1;
1372         dpsoftrast.uniform4f[index*4+2] = v2;
1373         dpsoftrast.uniform4f[index*4+3] = v3;
1374 }
1375 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1376 {
1377         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1378         command->index = index;
1379         memcpy(command->val, v, sizeof(command->val));
1380
1381         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1382 }
1383
1384 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1385 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1386 {
1387         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1388 }
1389 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1390 {
1391 #ifdef SSE_POSSIBLE
1392         int i, index;
1393         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1394         {
1395                 __m128 m0, m1, m2, m3;
1396                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1397                 command->index = (DPSOFTRAST_UNIFORM)index;
1398                 if (((size_t)v)&(ALIGN_SIZE-1))
1399                 {
1400                         m0 = _mm_loadu_ps(v);
1401                         m1 = _mm_loadu_ps(v+4);
1402                         m2 = _mm_loadu_ps(v+8);
1403                         m3 = _mm_loadu_ps(v+12);
1404                 }
1405                 else
1406                 {
1407                         m0 = _mm_load_ps(v);
1408                         m1 = _mm_load_ps(v+4);
1409                         m2 = _mm_load_ps(v+8);
1410                         m3 = _mm_load_ps(v+12);
1411                 }
1412                 if (transpose)
1413                 {
1414                         __m128 t0, t1, t2, t3;
1415                         t0 = _mm_unpacklo_ps(m0, m1);
1416                         t1 = _mm_unpacklo_ps(m2, m3);
1417                         t2 = _mm_unpackhi_ps(m0, m1);
1418                         t3 = _mm_unpackhi_ps(m2, m3);
1419                         m0 = _mm_movelh_ps(t0, t1);
1420                         m1 = _mm_movehl_ps(t1, t0);
1421                         m2 = _mm_movelh_ps(t2, t3);
1422                         m3 = _mm_movehl_ps(t3, t2);                     
1423                 }
1424                 _mm_store_ps(command->val, m0);
1425                 _mm_store_ps(command->val+4, m1);
1426                 _mm_store_ps(command->val+8, m2);
1427                 _mm_store_ps(command->val+12, m3);
1428                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1429                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1430                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1431                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1432         }
1433 #endif
1434 }
1435
1436 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1437 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1438 {
1439         thread->uniform1i[command->index] = command->val;
1440 }
1441 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1442 {
1443         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1444         command->index = index;
1445         command->val = i0;
1446
1447         dpsoftrast.uniform1i[command->index] = i0;
1448 }
1449
1450 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1451 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1452 {
1453         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1454         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1455 }
1456 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1457 {
1458         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1459         command->clipplane[0] = x;
1460         command->clipplane[1] = y;
1461         command->clipplane[2] = z;
1462         command->clipplane[3] = w;
1463 }
1464
1465 #ifdef SSE_POSSIBLE
1466 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1467 {
1468         float *end = dst + size*4;
1469         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1470         {
1471                 while (dst < end)
1472                 {
1473                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1474                         dst += 4;
1475                         src += stride;
1476                 }
1477         }
1478         else
1479         {
1480                 while (dst < end)
1481                 {
1482                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1483                         dst += 4;
1484                         src += stride;
1485                 }
1486         }
1487 }
1488
1489 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1490 {
1491         float *end = dst + size*4;
1492         if (stride == sizeof(float[3]))
1493         {
1494                 float *end4 = dst + (size&~3)*4;        
1495                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1496                 {
1497                         while (dst < end4)
1498                         {
1499                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1500                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1501                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1504                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1505                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1507                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1508                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1509                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1511                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1512                                 dst += 16;
1513                                 src += 4*sizeof(float[3]);
1514                         }
1515                 }
1516                 else
1517                 {
1518                         while (dst < end4)
1519                         {
1520                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1521                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1522                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1525                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1526                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1527                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1528                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1529                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1530                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1532                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533                                 dst += 16;
1534                                 src += 4*sizeof(float[3]);
1535                         }
1536                 }
1537         }
1538         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1539         {
1540                 while (dst < end)
1541                 {
1542                         __m128 v = _mm_loadu_ps((const float *)src);
1543                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1544                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1545                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1546                         _mm_store_ps(dst, v);
1547                         dst += 4;
1548                         src += stride;
1549                 }
1550         }
1551         else
1552         {
1553                 while (dst < end)
1554                 {
1555                         __m128 v = _mm_load_ps((const float *)src);
1556                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1557                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1558                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1559                         _mm_store_ps(dst, v);
1560                         dst += 4;
1561                         src += stride;
1562                 }
1563         }
1564 }
1565
1566 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1567 {
1568         float *end = dst + size*4;
1569         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1570         if (stride == sizeof(float[2]))
1571         {
1572                 float *end2 = dst + (size&~1)*4;
1573                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1574                 {
1575                         while (dst < end2)
1576                         {
1577                                 __m128 v = _mm_loadu_ps((const float *)src);
1578                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1579                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1580                                 dst += 8;
1581                                 src += 2*sizeof(float[2]);
1582                         }
1583                 }
1584                 else
1585                 {
1586                         while (dst < end2)
1587                         {
1588                                 __m128 v = _mm_load_ps((const float *)src);
1589                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1590                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1591                                 dst += 8;
1592                                 src += 2*sizeof(float[2]);
1593                         }
1594                 }
1595         }
1596         while (dst < end)
1597         {
1598                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1599                 dst += 4;
1600                 src += stride;
1601         }
1602 }
1603
1604 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1605 {
1606         float *end = dst + size*4;
1607         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1608         if (stride == sizeof(unsigned char[4]))
1609         {
1610                 float *end4 = dst + (size&~3)*4;
1611                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1612                 {
1613                         while (dst < end4)
1614                         {
1615                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1616                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1617                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1618                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1619                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1620                                 dst += 16;
1621                                 src += 4*sizeof(unsigned char[4]);
1622                         }
1623                 }
1624                 else
1625                 {
1626                         while (dst < end4)
1627                         {
1628                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1629                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1630                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1631                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1632                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1633                                 dst += 16;
1634                                 src += 4*sizeof(unsigned char[4]);
1635                         }
1636                 }
1637         }
1638         while (dst < end)
1639         {
1640                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1641                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1642                 dst += 4;
1643                 src += stride;
1644         }
1645 }
1646
// Writes the single 4-float value at src into each of the `size` 4-float slots of dst.
// src may be unaligned; dst is written with _mm_store_ps and has to be 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
        const __m128 fill = _mm_loadu_ps(src);
        float *const stop = dst + 4*size;
        for (; dst < stop; dst += 4)
                _mm_store_ps(dst, fill);
}
1657 #endif
1658
// Multiplies `numitems` 4-float vectors by the 16-float matrix at inmatrix16f:
// out = x*matrix[0..3] + y*matrix[4..7] + z*matrix[8..11] + w*matrix[12..15].
// An exact identity matrix degenerates into a plain copy (or nothing when out4f == in4f).
// out4f is written with _mm_store_ps and has to be 16-byte aligned; in4f and the matrix need not be.
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 mx, my, mz, mw;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // fast case for identity matrix
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        mx = _mm_loadu_ps(inmatrix16f);
        my = _mm_loadu_ps(inmatrix16f + 4);
        mz = _mm_loadu_ps(inmatrix16f + 8);
        mw = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        __m128 v = _mm_loadu_ps(in4f);
                        __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
        else
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        __m128 v = _mm_load_ps(in4f);
                        __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
#endif
}
1706
// Copies `numitems` 4-float vectors from in4f to out4f (plain memcpy, so the ranges must not overlap).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        const size_t bytes = numitems * sizeof(float[4]);
        memcpy(out4f, in4f, bytes);
}
1711
1712 #ifdef SSE_POSSIBLE
// Perspective-divides one vertex and maps it through the viewport.
//   in  : __m128 with lanes (x, y, z, w)
//   out : lanes 0..2 = viewportcenter[1..3] + viewportscale[1..3] * (x, y, z) / w
//         lane 3     = viewportcenter[0] + viewportscale[0] * 1.0f / w
// i.e. viewportcenter/viewportscale keep their x, y, z terms in lanes 1..3 and the term
// applied to the constant 1.0f in lane 0.
// The expansion declares locals named p and w, so the arguments must not refer to caller
// variables with those names.  `out` may be the same variable as `in`.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1720
// Same expansion, token for token, as DPSOFTRAST_PROJECTVERTEX: rotates (x, y, z, w) to
// (1, x, y, z), computes viewportcenter + viewportscale * that / w per lane, and rotates
// the result back so the x, y, z terms are in lanes 0..2 and the 1/w-based term in lane 3.
// NOTE(review): despite the name it projects all lanes, not only y; no use of this macro
// is visible in this part of the file - confirm whether it is still needed.
// The expansion declares locals named p and w, so the arguments must not refer to caller
// variables with those names.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1728
// Transforms one vertex by a matrix given as four __m128 values:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// where in.x .. in.w are lanes 0..3 of `in`, each broadcast across all four lanes.
// The expansion declares a local named p, so the arguments must not refer to a caller
// variable with that name.  `out` may be the same variable as `in`.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1737
// Computes a screen-space vertical range for the box spanned by minposf/maxposf after
// transformation by inmatrix16f.
//   starty, endy : receive the resulting range (see the end of the function)
//   minposf, maxposf : 4-float box corners, read with aligned loads (must be 16-byte aligned)
//   inmatrix16f : 16-float matrix, read with unaligned loads
// Returns a bitmask with bit k set for every box corner k whose transformed z + w is negative
// (0xFF when all eight corners are, 0 when none are).
// Corner k takes its x from maxpos when bit 0 of k is set, its y when bit 1 is set and its
// z and w when bit 2 is set; otherwise the component comes from minpos.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
{
        int clipmask = 0xFF;
        __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // minproj/maxproj only use lane 0; they start as an empty range (min above max)
        __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
        __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix quad so the transformed y ends up in lane 0,
        // where the scalar (_ss) operations below work on it
        m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
        m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
        m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
        m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k into bb[k] and store z + w in lane 0 of clipdist[k];
        // if that is >= 0, clear bit k of clipmask and merge the corner's y / w into minproj/maxproj
        #define BBFRONT(k, pos) \
        { \
                DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
                clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
                { \
                        __m128 proj; \
                        clipmask &= ~(1<<k); \
                        proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                        minproj = _mm_min_ss(minproj, proj); \
                        maxproj = _mm_max_ss(maxproj, proj); \
                } \
        }
        BBFRONT(0, minpos); 
        BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
        BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
        BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner k still flagged in clipmask, look at its three edge neighbours
        // (k^1, k^2, k^4); for every neighbour that is not flagged, interpolate along the edge to
        // the point where z + w reaches zero and merge that point's y / w into minproj/maxproj
        #define BBCLIP(k) \
        { \
                if (clipmask&(1<<k)) \
                { \
                        if (!(clipmask&(1<<(k^1)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^2)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^4)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                } \
        }
        BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring lane 2 of the viewport center/scale (the y term, per DPSOFTRAST_PROJECTVERTEX) into lane 0
        viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
        viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // limit the projected range to [-2, 2] before mapping it to the screen
        minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
        maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
        minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
        maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // NOTE(review): the start comes from maxproj and the end from minproj, which presumably
        // relies on the y viewport scale being negative - confirm against where fb_viewportscale is set
        *starty = _mm_cvttss_si32(maxproj);
        *endy = _mm_cvttss_si32(minproj)+1;
        return clipmask;
}
1811         
1812 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1813 {
1814         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1815         float *end = out4f + numitems*4;
1816         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1817         __m128 minpos, maxpos;
1818         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1819         {
1820                 minpos = maxpos = _mm_loadu_ps(in4f);
1821                 while (out4f < end)
1822                 {
1823                         __m128 v = _mm_loadu_ps(in4f);
1824                         minpos = _mm_min_ps(minpos, v);
1825                         maxpos = _mm_max_ps(maxpos, v);
1826                         _mm_store_ps(out4f, v);
1827                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828                         _mm_store_ps(screen4f, v);
1829                         in4f += 4;
1830                         out4f += 4;
1831                         screen4f += 4;
1832                 }
1833         }
1834         else
1835         {
1836                 minpos = maxpos = _mm_load_ps(in4f);
1837                 while (out4f < end)
1838                 {
1839                         __m128 v = _mm_load_ps(in4f);
1840                         minpos = _mm_min_ps(minpos, v);
1841                         maxpos = _mm_max_ps(maxpos, v);
1842                         _mm_store_ps(out4f, v);
1843                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1844                         _mm_store_ps(screen4f, v);
1845                         in4f += 4;
1846                         out4f += 4;
1847                         screen4f += 4;
1848                 }
1849         }
1850         if (starty && endy) 
1851         {
1852                 ALIGN(float minposf[4]);
1853                 ALIGN(float maxposf[4]);
1854                 _mm_store_ps(minposf, minpos);
1855                 _mm_store_ps(maxposf, maxpos);
1856                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1857         }
1858         return 0;
1859 }
1860
1861 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1862 {
1863         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1864         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1865         float *end;
1866         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1867                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1868         end = out4f + numitems*4;
1869         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1870         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1871         m0 = _mm_loadu_ps(inmatrix16f);
1872         m1 = _mm_loadu_ps(inmatrix16f + 4);
1873         m2 = _mm_loadu_ps(inmatrix16f + 8);
1874         m3 = _mm_loadu_ps(inmatrix16f + 12);
1875         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1876         {
1877                 minpos = maxpos = _mm_loadu_ps(in4f);
1878                 while (out4f < end)
1879                 {
1880                         __m128 v = _mm_loadu_ps(in4f);
1881                         minpos = _mm_min_ps(minpos, v);
1882                         maxpos = _mm_max_ps(maxpos, v);
1883                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1884                         _mm_store_ps(out4f, v);
1885                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1886                         _mm_store_ps(screen4f, v);
1887                         in4f += 4;
1888                         out4f += 4;
1889                         screen4f += 4;
1890                 }
1891         }
1892         else
1893         {
1894                 minpos = maxpos = _mm_load_ps(in4f);
1895                 while (out4f < end)
1896                 {
1897                         __m128 v = _mm_load_ps(in4f);
1898                         minpos = _mm_min_ps(minpos, v);
1899                         maxpos = _mm_max_ps(maxpos, v);
1900                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1901                         _mm_store_ps(out4f, v);
1902                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1903                         _mm_store_ps(screen4f, v);
1904                         in4f += 4;
1905                         out4f += 4;
1906                         screen4f += 4;
1907                 }
1908         }
1909         if (starty && endy) 
1910         {
1911                 ALIGN(float minposf[4]);
1912                 ALIGN(float maxposf[4]);
1913                 _mm_store_ps(minposf, minpos);
1914                 _mm_store_ps(maxposf, maxpos);
1915                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1916         }
1917         return 0;
1918 }
1919 #endif
1920
// Converts the currently bound vertex array `inarray` (position, color or a texcoord unit),
// for vertices firstvertex .. firstvertex+numvertices-1, into 4-float form in
// dpsoftrast.post_array4f[outarray] and returns that buffer.
// Colors come from the float pointer if set, else the byte pointer, else the constant
// dpsoftrast.color.  A texcoord unit with no pointer, or with a component count other than
// 2, 3 or 4, leaves the buffer untouched.  Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *outf = dpsoftrast.post_array4f[outarray];
        const unsigned char *inb;
        int firstvertex = dpsoftrast.firstvertex;
        int numvertices = dpsoftrast.numvertices;
        int stride;
        int texunit;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
                        DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
                        DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
                }
                else
                {
                        // no color array bound: replicate the constant color
                        DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
                }
        }
        else
        {
                // anything else is treated as a texcoord unit
                texunit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[texunit];
                if (dpsoftrast.pointer_texcoordf[texunit])
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[texunit] + firstvertex * stride;
                        if (dpsoftrast.components_texcoord[texunit] == 2)
                                DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
                        else if (dpsoftrast.components_texcoord[texunit] == 3)
                                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
                        else if (dpsoftrast.components_texcoord[texunit] == 4)
                                DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
        }
        return outf;
#else
        return NULL;
#endif
}
1979
1980 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1981 {
1982         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1983         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1984         return data;
1985 }
1986
1987 #if 0
// Projects an array to screen coordinates without transforming it; fills
// dpsoftrast.screencoord4f, drawstarty, drawendy and drawclipped.  With inarray >= 0 the
// array is first loaded into post_array4f[outarray].  Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
        return data;
#else
        return NULL;
#endif
}
1998 #endif
1999
// Transforms an array in place by inmatrix16f and projects it to screen coordinates; fills
// dpsoftrast.screencoord4f, drawstarty, drawendy and drawclipped.  With inarray >= 0 the
// array is first loaded into post_array4f[outarray].  Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
2010
2011 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2012 {
2013         int x;
2014         int startx = span->startx;
2015         int endx = span->endx;
2016         float wslope = triangle->w[0];
2017         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2018         float endz = 1.0f / (w + wslope * startx);
2019         if (triangle->w[0] == 0)
2020         {
2021                 // LordHavoc: fast flat polygons (HUD/menu)
2022                 for (x = startx;x < endx;x++)
2023                         zf[x] = endz;
2024                 return;
2025         }
2026         for (x = startx;x < endx;)
2027         {
2028                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2029                 float z = endz, dz;
2030                 if (nextsub >= endx) nextsub = endsub = endx-1;
2031                 endz = 1.0f / (w + wslope * nextsub);
2032                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2033                 for (; x <= endsub; x++, z += dz)
2034                         zf[x] = z;
2035         }
2036 }
2037
2038 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2039 {
2040         int x;
2041         int startx = span->startx;
2042         int endx = span->endx;
2043         int d[4];
2044         float a, b;
2045         unsigned char * RESTRICT pixelmask = span->pixelmask;
2046         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2047         if (!pixel)
2048                 return;
2049         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2050         // handle alphatest now (this affects depth writes too)
2051         if (thread->alphatest)
2052                 for (x = startx;x < endx;x++)
2053                         if (in4f[x*4+3] < 0.5f)
2054                                 pixelmask[x] = false;
2055         // FIXME: this does not handle bigendian
2056         switch(thread->fb_blendmode)
2057         {
2058         case DPSOFTRAST_BLENDMODE_OPAQUE:
2059                 for (x = startx;x < endx;x++)
2060                 {
2061                         if (!pixelmask[x])
2062                                 continue;
2063                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2064                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2065                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2066                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2067                         pixel[x*4+0] = d[0];
2068                         pixel[x*4+1] = d[1];
2069                         pixel[x*4+2] = d[2];
2070                         pixel[x*4+3] = d[3];
2071                 }
2072                 break;
2073         case DPSOFTRAST_BLENDMODE_ALPHA:
2074                 for (x = startx;x < endx;x++)
2075                 {
2076                         if (!pixelmask[x])
2077                                 continue;
2078                         a = in4f[x*4+3] * 255.0f;
2079                         b = 1.0f - in4f[x*4+3];
2080                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2081                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2082                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2083                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2084                         pixel[x*4+0] = d[0];
2085                         pixel[x*4+1] = d[1];
2086                         pixel[x*4+2] = d[2];
2087                         pixel[x*4+3] = d[3];
2088                 }
2089                 break;
2090         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2091                 for (x = startx;x < endx;x++)
2092                 {
2093                         if (!pixelmask[x])
2094                                 continue;
2095                         a = in4f[x*4+3] * 255.0f;
2096                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2097                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2098                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2099                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2100                         pixel[x*4+0] = d[0];
2101                         pixel[x*4+1] = d[1];
2102                         pixel[x*4+2] = d[2];
2103                         pixel[x*4+3] = d[3];
2104                 }
2105                 break;
2106         case DPSOFTRAST_BLENDMODE_ADD:
2107                 for (x = startx;x < endx;x++)
2108                 {
2109                         if (!pixelmask[x])
2110                                 continue;
2111                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2112                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2113                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2114                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2115                         pixel[x*4+0] = d[0];
2116                         pixel[x*4+1] = d[1];
2117                         pixel[x*4+2] = d[2];
2118                         pixel[x*4+3] = d[3];
2119                 }
2120                 break;
2121         case DPSOFTRAST_BLENDMODE_INVMOD:
2122                 for (x = startx;x < endx;x++)
2123                 {
2124                         if (!pixelmask[x])
2125                                 continue;
2126                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2127                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2128                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2129                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2130                         pixel[x*4+0] = d[0];
2131                         pixel[x*4+1] = d[1];
2132                         pixel[x*4+2] = d[2];
2133                         pixel[x*4+3] = d[3];
2134                 }
2135                 break;
2136         case DPSOFTRAST_BLENDMODE_MUL:
2137                 for (x = startx;x < endx;x++)
2138                 {
2139                         if (!pixelmask[x])
2140                                 continue;
2141                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2142                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2144                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145                         pixel[x*4+0] = d[0];
2146                         pixel[x*4+1] = d[1];
2147                         pixel[x*4+2] = d[2];
2148                         pixel[x*4+3] = d[3];
2149                 }
2150                 break;
2151         case DPSOFTRAST_BLENDMODE_MUL2:
2152                 for (x = startx;x < endx;x++)
2153                 {
2154                         if (!pixelmask[x])
2155                                 continue;
2156                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2157                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2158                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2159                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2160                         pixel[x*4+0] = d[0];
2161                         pixel[x*4+1] = d[1];
2162                         pixel[x*4+2] = d[2];
2163                         pixel[x*4+3] = d[3];
2164                 }
2165                 break;
2166         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2167                 for (x = startx;x < endx;x++)
2168                 {
2169                         if (!pixelmask[x])
2170                                 continue;
2171                         a = in4f[x*4+3] * -255.0f;
2172                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2173                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2174                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2175                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2176                         pixel[x*4+0] = d[0];
2177                         pixel[x*4+1] = d[1];
2178                         pixel[x*4+2] = d[2];
2179                         pixel[x*4+3] = d[3];
2180                 }
2181                 break;
2182         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2183                 for (x = startx;x < endx;x++)
2184                 {
2185                         if (!pixelmask[x])
2186                                 continue;
2187                         a = 255.0f;
2188                         b = 1.0f - in4f[x*4+3];
2189                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2190                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2191                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2192                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2193                         pixel[x*4+0] = d[0];
2194                         pixel[x*4+1] = d[1];
2195                         pixel[x*4+2] = d[2];
2196                         pixel[x*4+3] = d[3];
2197                 }
2198                 break;
2199         case DPSOFTRAST_BLENDMODE_INVADD:
2200                 for (x = startx;x < endx;x++)
2201                 {
2202                         if (!pixelmask[x])
2203                                 continue;
2204                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2205                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2206                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2207                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2208                         pixel[x*4+0] = d[0];
2209                         pixel[x*4+1] = d[1];
2210                         pixel[x*4+2] = d[2];
2211                         pixel[x*4+3] = d[3];
2212                 }
2213                 break;
2214         }
2215 }
2216
2217 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2218 {
2219 #ifdef SSE_POSSIBLE
2220         int x;
2221         int startx = span->startx;
2222         int endx = span->endx;
2223         int subx;
2224         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2225         unsigned char * RESTRICT pixelmask = span->pixelmask;
2226         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2227         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2228         if (!pixel)
2229                 return;
2230         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2231         pixeli += span->y * dpsoftrast.fb_width + span->x;
2232         // handle alphatest now (this affects depth writes too)
2233         if (thread->alphatest)
2234                 for (x = startx;x < endx;x++)
2235                         if (in4ub[x*4+3] < 128)
2236                                 pixelmask[x] = false;
2237         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2238         // helps sprites, text and hud artwork
2239         switch(thread->fb_blendmode)
2240         {
2241         case DPSOFTRAST_BLENDMODE_ALPHA:
2242         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2244                 for (x = startx;x < endx;x++)
2245                         if (in4ub[x*4+3] < 1)
2246                                 pixelmask[x] = false;
2247                 break;
2248         case DPSOFTRAST_BLENDMODE_OPAQUE:
2249         case DPSOFTRAST_BLENDMODE_ADD:
2250         case DPSOFTRAST_BLENDMODE_INVMOD:
2251         case DPSOFTRAST_BLENDMODE_MUL:
2252         case DPSOFTRAST_BLENDMODE_MUL2:
2253         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2254         case DPSOFTRAST_BLENDMODE_INVADD:
2255                 break;
2256         }
2257         // put some special values at the end of the mask to ensure the loops end
2258         pixelmask[endx] = 1;
2259         pixelmask[endx+1] = 0;
2260         // LordHavoc: use a double loop to identify subspans, this helps the
2261         // optimized copy/blend loops to perform at their best, most triangles
2262         // have only one run of pixels, and do the search using wide reads...
2263         x = startx;
2264         while (x < endx)
2265         {
2266                 // if this pixel is masked off, it's probably not alone...
2267                 if (!pixelmask[x])
2268                 {
2269                         x++;
2270 #if 1
2271                         if (x + 8 < endx)
2272                         {
2273                                 // the 4-item search must be aligned or else it stalls badly
2274                                 if ((x & 3) && !pixelmask[x]) x++;
2275                                 if ((x & 3) && !pixelmask[x]) x++;
2276                                 if ((x & 3) && !pixelmask[x]) x++;
2277                                 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2278                                         x += 4;
2279                         }
2280 #endif
2281                         for (;!pixelmask[x];x++)
2282                                 ;
2283                         // rather than continue the loop, just check the end variable
2284                         if (x >= endx)
2285                                 break;
2286                 }
2287                 // find length of subspan
2288                 subx = x + 1;
2289 #if 1
2290                 if (x + 8 < endx)
2291                 {
2292                         if ((subx & 3) && pixelmask[subx]) subx++;
2293                         if ((subx & 3) && pixelmask[subx]) subx++;
2294                         if ((subx & 3) && pixelmask[subx]) subx++;
2295                         while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2296                                 subx += 4;
2297                 }
2298 #endif
2299                 for (;pixelmask[subx];subx++)
2300                         ;
2301                 // the checks can overshoot, so make sure to clip it...
2302                 if (subx > endx)
2303                         subx = endx;
2304                 // now that we know the subspan length...  process!
2305                 switch(thread->fb_blendmode)
2306                 {
2307                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2308 #if 0
2309                         if (subx - x >= 16)
2310                         {
2311                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2312                                 x = subx;
2313                         }
2314                         else
2315 #elif 1
2316                         while (x + 16 <= subx)
2317                         {
2318                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2319                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2320                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2321                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2322                                 x += 16;
2323                         }
2324 #endif
2325                         {
2326                                 while (x + 4 <= subx)
2327                                 {
2328                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2329                                         x += 4;
2330                                 }
2331                                 if (x + 2 <= subx)
2332                                 {
2333                                         pixeli[x] = ini[x];
2334                                         pixeli[x+1] = ini[x+1];
2335                                         x += 2;
2336                                 }
2337                                 if (x < subx)
2338                                 {
2339                                         pixeli[x] = ini[x];
2340                                         x++;
2341                                 }
2342                         }
2343                         break;
2344                 case DPSOFTRAST_BLENDMODE_ALPHA:
2345                 #define FINISHBLEND(blend2, blend1) \
2346                         for (;x + 1 < subx;x += 2) \
2347                         { \
2348                                 __m128i src, dst; \
2349                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2350                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2351                                 blend2; \
2352                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2353                         } \
2354                         if (x < subx) \
2355                         { \
2356                                 __m128i src, dst; \
2357                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2358                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2359                                 blend1; \
2360                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2361                                 x++; \
2362                         }
2363                         FINISHBLEND({
2364                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2365                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2366                         }, {
2367                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2368                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2369                         });
2370                         break;
2371                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2372                         FINISHBLEND({
2373                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2374                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2375                         }, {
2376                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2377                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2378                         });
2379                         break;
2380                 case DPSOFTRAST_BLENDMODE_ADD:
2381                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2382                         break;
2383                 case DPSOFTRAST_BLENDMODE_INVMOD:
2384                         FINISHBLEND({
2385                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2386                         }, {
2387                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2388                         });
2389                         break;
2390                 case DPSOFTRAST_BLENDMODE_MUL:
2391                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2392                         break;
2393                 case DPSOFTRAST_BLENDMODE_MUL2:
2394                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2395                         break;
2396                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2397                         FINISHBLEND({
2398                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2399                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2400                         }, {
2401                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2402                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2403                         });
2404                         break;
2405                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2406                         FINISHBLEND({
2407                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2408                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2409                         }, {
2410                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2411                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2412                         });
2413                         break;
2414                 case DPSOFTRAST_BLENDMODE_INVADD:
2415                         FINISHBLEND({
2416                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2417                         }, {
2418                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2419                         });
2420                         break;
2421                 }
2422         }
2423 #endif
2424 }
2425
2426 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2427 {
2428         int x;
2429         int startx = span->startx;
2430         int endx = span->endx;
2431         int flags;
2432         float c[4];
2433         float data[4];
2434         float slope[4];
2435         float tc[2], endtc[2];
2436         float tcscale[2];
2437         unsigned int tci[2];
2438         unsigned int tci1[2];
2439         unsigned int tcimin[2];
2440         unsigned int tcimax[2];
2441         int tciwrapmask[2];
2442         int tciwidth;
2443         int filter;
2444         int mip;
2445         const unsigned char * RESTRICT pixelbase;
2446         const unsigned char * RESTRICT pixel[4];
2447         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2448         // if no texture is bound, just fill it with white
2449         if (!texture)
2450         {
2451                 for (x = startx;x < endx;x++)
2452                 {
2453                         out4f[x*4+0] = 1.0f;
2454                         out4f[x*4+1] = 1.0f;
2455                         out4f[x*4+2] = 1.0f;
2456                         out4f[x*4+3] = 1.0f;
2457                 }
2458                 return;
2459         }
2460         mip = triangle->mip[texunitindex];
2461         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2462         // if this mipmap of the texture is 1 pixel, just fill it with that color
2463         if (texture->mipmap[mip][1] == 4)
2464         {
2465                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2466                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2467                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2468                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2469                 for (x = startx;x < endx;x++)
2470                 {
2471                         out4f[x*4+0] = c[0];
2472                         out4f[x*4+1] = c[1];
2473                         out4f[x*4+2] = c[2];
2474                         out4f[x*4+3] = c[3];
2475                 }
2476                 return;
2477         }
2478         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2479         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2480         flags = texture->flags;
2481         tcscale[0] = texture->mipmap[mip][2];
2482         tcscale[1] = texture->mipmap[mip][3];
2483         tciwidth = texture->mipmap[mip][2];
2484         tcimin[0] = 0;
2485         tcimin[1] = 0;
2486         tcimax[0] = texture->mipmap[mip][2]-1;
2487         tcimax[1] = texture->mipmap[mip][3]-1;
2488         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2489         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2490         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2491         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2492         if (filter)
2493         {
2494                 endtc[0] -= 0.5f;
2495                 endtc[1] -= 0.5f;
2496         }
2497         for (x = startx;x < endx;)
2498         {
2499                 unsigned int subtc[2];
2500                 unsigned int substep[2];
2501                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2502                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2503                 if (nextsub >= endx)
2504                 {
2505                         nextsub = endsub = endx-1;      
2506                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2507                 }
2508                 tc[0] = endtc[0];
2509                 tc[1] = endtc[1];
2510                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2511                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2512                 if (filter)
2513                 {
2514                         endtc[0] -= 0.5f;
2515                         endtc[1] -= 0.5f;
2516                 }
2517                 substep[0] = (endtc[0] - tc[0]) * subscale;
2518                 substep[1] = (endtc[1] - tc[1]) * subscale;
2519                 subtc[0] = tc[0] * (1<<12);
2520                 subtc[1] = tc[1] * (1<<12);
2521                 if (filter)
2522                 {
2523                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2524                         {
2525                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2526                                 {
2527                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2528                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2529                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2530                                         tci[0] = subtc[0]>>12;
2531                                         tci[1] = subtc[1]>>12;
2532                                         tci1[0] = tci[0] + 1;
2533                                         tci1[1] = tci[1] + 1;
2534                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2535                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2536                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2537                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2538                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2539                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2540                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2541                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2542                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2543                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2544                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2545                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2546                                         out4f[x*4+0] = c[0];
2547                                         out4f[x*4+1] = c[1];
2548                                         out4f[x*4+2] = c[2];
2549                                         out4f[x*4+3] = c[3];
2550                                 }
2551                         }
2552                         else
2553                         {
2554                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2555                                 {
2556                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2557                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2558                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2559                                         tci[0] = subtc[0]>>12;
2560                                         tci[1] = subtc[1]>>12;
2561                                         tci1[0] = tci[0] + 1;
2562                                         tci1[1] = tci[1] + 1;
2563                                         tci[0] &= tciwrapmask[0];
2564                                         tci[1] &= tciwrapmask[1];
2565                                         tci1[0] &= tciwrapmask[0];
2566                                         tci1[1] &= tciwrapmask[1];
2567                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2568                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2569                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2570                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2571                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2572                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2573                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2574                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2575                                         out4f[x*4+0] = c[0];
2576                                         out4f[x*4+1] = c[1];
2577                                         out4f[x*4+2] = c[2];
2578                                         out4f[x*4+3] = c[3];
2579                                 }
2580                         }
2581                 }
2582                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2583                 {
2584                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2585                         {
2586                                 tci[0] = subtc[0]>>12;
2587                                 tci[1] = subtc[1]>>12;
2588                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2589                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2590                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2591                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2592                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2593                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2594                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2595                                 out4f[x*4+0] = c[0];
2596                                 out4f[x*4+1] = c[1];
2597                                 out4f[x*4+2] = c[2];
2598                                 out4f[x*4+3] = c[3];
2599                         }
2600                 }
2601                 else
2602                 {
2603                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2604                         {
2605                                 tci[0] = subtc[0]>>12;
2606                                 tci[1] = subtc[1]>>12;
2607                                 tci[0] &= tciwrapmask[0];
2608                                 tci[1] &= tciwrapmask[1];
2609                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2610                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2611                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2612                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2613                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2614                                 out4f[x*4+0] = c[0];
2615                                 out4f[x*4+1] = c[1];
2616                                 out4f[x*4+2] = c[2];
2617                                 out4f[x*4+3] = c[3];
2618                         }
2619                 }
2620         }
2621 }
2622
2623 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2624 {
2625 #ifdef SSE_POSSIBLE
2626         int x;
2627         int startx = span->startx;
2628         int endx = span->endx;
2629         int flags;
2630         __m128 data, slope, tcscale;
2631         __m128i tcsize, tcmask, tcoffset, tcmax;
2632         __m128 tc, endtc;
2633         __m128i subtc, substep, endsubtc;
2634         int filter;
2635         int mip;
2636         int affine; // LordHavoc: optimized affine texturing case
2637         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2638         const unsigned char * RESTRICT pixelbase;
2639         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2640         // if no texture is bound, just fill it with white
2641         if (!texture)
2642         {
2643                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2644                 return;
2645         }
2646         mip = triangle->mip[texunitindex];
2647         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2648         // if this mipmap of the texture is 1 pixel, just fill it with that color
2649         if (texture->mipmap[mip][1] == 4)
2650         {
2651                 unsigned int k = *((const unsigned int *)pixelbase);
2652                 for (x = startx;x < endx;x++)
2653                         outi[x] = k;
2654                 return;
2655         }
2656         affine = zf[startx] == zf[endx-1];
2657         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2658         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2659         flags = texture->flags;
2660         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2661         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2662         tcscale = _mm_cvtepi32_ps(tcsize);
2663         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2664         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2665         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2666         if (filter)
2667                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2668         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2669         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2670         tcmax = _mm_packs_epi32(tcmask, tcmask);
2671         for (x = startx;x < endx;)
2672         {
2673                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2674                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2675                 if (nextsub >= endx || affine)
2676                 {
2677                         nextsub = endsub = endx-1;
2678                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2679                 }       
2680                 tc = endtc;
2681                 subtc = endsubtc;
2682                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2683                 if (filter)
2684                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2685                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2686                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2687                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2688                 substep = _mm_slli_epi32(substep, 1);
2689                 if (filter)
2690                 {
2691                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2692                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2693                         {
2694                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2695                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2696                                 {
2697                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2698                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2699                                         tci = _mm_madd_epi16(tci, tcoffset);
2700                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2701                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2702                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2703                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2704                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2705                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2706                                         fracm = _mm_srli_epi16(subtc, 1);
2707                                         pix1 = _mm_add_epi16(pix1,
2708                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2709                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2710                                         pix3 = _mm_add_epi16(pix3,
2711                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2712                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2713                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2714                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2715                                         pix2 = _mm_add_epi16(pix2,
2716                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2717                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2718                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2719                                 }
2720                                 if (x <= endsub)
2721                                 {
2722                                         const unsigned char * RESTRICT ptr1;
2723                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2724                                         tci = _mm_madd_epi16(tci, tcoffset);
2725                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2726                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2727                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2728                                         fracm = _mm_srli_epi16(subtc, 1);
2729                                         pix1 = _mm_add_epi16(pix1,
2730                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2731                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2732                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2733                                         pix1 = _mm_add_epi16(pix1,
2734                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2735                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2736                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2737                                         x++;
2738                                 }
2739                         }
2740                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2741                         {
2742                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2743                                 {
2744                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2745                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2746                                         tci = _mm_madd_epi16(tci, tcoffset);
2747                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2748                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2749                                                                                         _mm_setzero_si128());
2750                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2751                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2752                                                                                         _mm_setzero_si128());
2753                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2754                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2755                                         tci = _mm_madd_epi16(tci, tcoffset);
2756                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2757                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2758                                                                                         _mm_setzero_si128());
2759                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2760                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2761                                                                                         _mm_setzero_si128());
2762                                         fracm = _mm_srli_epi16(subtc, 1);
2763                                         pix1 = _mm_add_epi16(pix1,
2764                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2765                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2766                                         pix3 = _mm_add_epi16(pix3,
2767                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2768                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2769                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2770                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2771                                         pix2 = _mm_add_epi16(pix2,
2772                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2773                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2774                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2775                                 }
2776                                 if (x <= endsub)
2777                                 {
2778                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2779                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2780                                         tci = _mm_madd_epi16(tci, tcoffset);
2781                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2782                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2783                                                                                         _mm_setzero_si128());
2784                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2785                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2786                                                                                         _mm_setzero_si128());
2787                                         fracm = _mm_srli_epi16(subtc, 1);
2788                                         pix1 = _mm_add_epi16(pix1,
2789                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2790                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2791                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2792                                         pix1 = _mm_add_epi16(pix1,
2793                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2794                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2795                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2796                                         x++;
2797                                 }
2798                         }
2799                         else
2800                         {
2801                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2802                                 {
2803                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2804                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2805                                         tci = _mm_madd_epi16(tci, tcoffset);
2806                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2807                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2808                                                                                         _mm_setzero_si128());
2809                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2810                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2811                                                                                         _mm_setzero_si128());
2812                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2813                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2814                                         tci = _mm_madd_epi16(tci, tcoffset);
2815                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2816                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2817                                                                                         _mm_setzero_si128());
2818                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2819                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2820                                                                                         _mm_setzero_si128());
2821                                         fracm = _mm_srli_epi16(subtc, 1);
2822                                         pix1 = _mm_add_epi16(pix1,
2823                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2824                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2825                                         pix3 = _mm_add_epi16(pix3,
2826                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2827                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2828                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2829                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2830                                         pix2 = _mm_add_epi16(pix2,
2831                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2832                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2833                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2834                                 }
2835                                 if (x <= endsub)
2836                                 {
2837                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2838                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2839                                         tci = _mm_madd_epi16(tci, tcoffset);
2840                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2841                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2842                                                                                         _mm_setzero_si128());
2843                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2844                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2845                                                                                         _mm_setzero_si128());
2846                                         fracm = _mm_srli_epi16(subtc, 1);
2847                                         pix1 = _mm_add_epi16(pix1,
2848                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2849                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2850                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2851                                         pix1 = _mm_add_epi16(pix1,
2852                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2853                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2854                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2855                                         x++;
2856                                 }
2857                         }
2858                 }
2859                 else
2860                 {
2861                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2862                         {
2863                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2864                                 {
2865                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2866                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2867                                         tci = _mm_madd_epi16(tci, tcoffset);
2868                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2869                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2870                                 }
2871                                 if (x <= endsub)
2872                                 {
2873                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2874                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2875                                         tci = _mm_madd_epi16(tci, tcoffset);
2876                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2877                                         x++;
2878                                 }
2879                         }
2880                         else
2881                         {
2882                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2883                                 {
2884                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2885                                         tci = _mm_and_si128(tci, tcmax); 
2886                                         tci = _mm_madd_epi16(tci, tcoffset);
2887                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2888                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2889                                 }
2890                                 if (x <= endsub)
2891                                 {
2892                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2893                                         tci = _mm_and_si128(tci, tcmax); 
2894                                         tci = _mm_madd_epi16(tci, tcoffset);
2895                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2896                                         x++;
2897                                 }
2898                         }
2899                 }
2900         }
2901 #endif
2902 }
2903
2904 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2905 {
2906         // TODO: IMPLEMENT
2907         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2908 }
2909
// Shadowmap sampling placeholder: the argument is not examined and the
// result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2915
2916 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2917 {
2918         int x;
2919         int startx = span->startx;
2920         int endx = span->endx;
2921         float c[4];
2922         float data[4];
2923         float slope[4];
2924         float z;
2925         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2926         for (x = startx;x < endx;x++)
2927         {
2928                 z = zf[x];
2929                 c[0] = (data[0] + slope[0]*x) * z;
2930                 c[1] = (data[1] + slope[1]*x) * z;
2931                 c[2] = (data[2] + slope[2]*x) * z;
2932                 c[3] = (data[3] + slope[3]*x) * z;
2933                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2934                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2935                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2936                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2937         }
2938 }
2939
2940 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2941 {
2942         int x;
2943         int startx = span->startx;
2944         int endx = span->endx;
2945         float c[4];
2946         float data[4];
2947         float slope[4];
2948         float z;
2949         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2950         for (x = startx;x < endx;x++)
2951         {
2952                 z = zf[x];
2953                 c[0] = (data[0] + slope[0]*x) * z;
2954                 c[1] = (data[1] + slope[1]*x) * z;
2955                 c[2] = (data[2] + slope[2]*x) * z;
2956                 c[3] = (data[3] + slope[3]*x) * z;
2957                 out4f[x*4+0] = c[0];
2958                 out4f[x*4+1] = c[1];
2959                 out4f[x*4+2] = c[2];
2960                 out4f[x*4+3] = c[3];
2961         }
2962 }
2963
2964 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2965 {
2966         int x, startx = span->startx, endx = span->endx;
2967         float c[4], localcolor[4];
2968         localcolor[0] = subcolor[0];
2969         localcolor[1] = subcolor[1];
2970         localcolor[2] = subcolor[2];
2971         localcolor[3] = subcolor[3];
2972         for (x = startx;x < endx;x++)
2973         {
2974                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2975                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2976                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2977                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2978                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2979                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2980                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2981                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2982         }
2983 }
2984
2985 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2986 {
2987         int x, startx = span->startx, endx = span->endx;
2988         for (x = startx;x < endx;x++)
2989         {
2990                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2991                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2992                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2993                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2994         }
2995 }
2996
2997 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2998 {
2999         int x, startx = span->startx, endx = span->endx;
3000         for (x = startx;x < endx;x++)
3001         {
3002                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
3003                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
3004                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
3005                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
3006         }
3007 }
3008
3009 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3010 {
3011         int x, startx = span->startx, endx = span->endx;
3012         float a, b;
3013         for (x = startx;x < endx;x++)
3014         {
3015                 a = 1.0f - inb4f[x*4+3];
3016                 b = inb4f[x*4+3];
3017                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3018                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3019                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3020                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
3021         }
3022 }
3023
3024 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3025 {
3026         int x, startx = span->startx, endx = span->endx;
3027         float localcolor[4], ilerp, lerp;
3028         localcolor[0] = color[0];
3029         localcolor[1] = color[1];
3030         localcolor[2] = color[2];
3031         localcolor[3] = color[3];
3032         ilerp = 1.0f - localcolor[3];
3033         lerp = localcolor[3];
3034         for (x = startx;x < endx;x++)
3035         {
3036                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3037                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3038                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3039                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
3040         }
3041 }
3042
3043
3044
3045 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3046 {
3047 #ifdef SSE_POSSIBLE
3048         int x;
3049         int startx = span->startx;
3050         int endx = span->endx;
3051         __m128 data, slope;
3052         __m128 mod, endmod;
3053         __m128i submod, substep, endsubmod;
3054         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3055         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3056         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3057         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3058         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3059         for (x = startx; x < endx;)
3060         {
3061                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3062                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3063                 if (nextsub >= endx)
3064                 {
3065                         nextsub = endsub = endx-1;
3066                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3067                 }
3068                 mod = endmod;
3069                 submod = endsubmod;
3070                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3071                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3072                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3073                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3074                 substep = _mm_packs_epi32(substep, substep);
3075                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3076                 {
3077                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3078                         pix = _mm_mulhi_epu16(pix, submod);
3079                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3080                 }
3081                 if (x <= endsub)
3082                 {
3083                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3084                         pix = _mm_mulhi_epu16(pix, submod);
3085                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3086                         x++;
3087                 }
3088         }
3089 #endif
3090 }
3091
3092 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3093 {
3094 #ifdef SSE_POSSIBLE
3095         int x;
3096         int startx = span->startx;
3097         int endx = span->endx;
3098         __m128 data, slope;
3099         __m128 mod, endmod;
3100         __m128i submod, substep, endsubmod;
3101         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3102         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3103         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3104         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3105         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3106         for (x = startx; x < endx;)
3107         {
3108                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3109                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3110                 if (nextsub >= endx)
3111                 {
3112                         nextsub = endsub = endx-1;
3113                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3114                 }
3115                 mod = endmod;
3116                 submod = endsubmod;
3117                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3118                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3119                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3120                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3121                 substep = _mm_packs_epi32(substep, substep);
3122                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3123                 {
3124                         __m128i pix = _mm_srai_epi16(submod, 4);
3125                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3126                 }
3127                 if (x <= endsub)
3128                 {
3129                         __m128i pix = _mm_srai_epi16(submod, 4);
3130                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3131                         x++;
3132                 }
3133         }
3134 #endif
3135 }
3136
3137 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3138 {
3139 #ifdef SSE_POSSIBLE
3140         int x, startx = span->startx, endx = span->endx;
3141         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3142         localcolor = _mm_packs_epi32(localcolor, localcolor);
3143         for (x = startx;x+2 <= endx;x+=2)
3144         {
3145                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3146                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3147                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3148                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3149         }
3150         if (x < endx)
3151         {
3152                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3153                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3154                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3155                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3156         }
3157 #endif
3158 }
3159
3160 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3161 {
3162 #ifdef SSE_POSSIBLE
3163         int x, startx = span->startx, endx = span->endx;
3164         for (x = startx;x+2 <= endx;x+=2)
3165         {
3166                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3167                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3168                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3169                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3170         }
3171         if (x < endx)
3172         {
3173                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3174                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3175                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3176                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3177         }
3178 #endif
3179 }
3180
3181 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3182 {
3183 #ifdef SSE_POSSIBLE
3184         int x, startx = span->startx, endx = span->endx;
3185         for (x = startx;x+2 <= endx;x+=2)
3186         {
3187                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3188                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3189                 pix1 = _mm_add_epi16(pix1, pix2);
3190                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3191         }
3192         if (x < endx)
3193         {
3194                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3195                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3196                 pix1 = _mm_add_epi16(pix1, pix2);
3197                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3198         }
3199 #endif
3200 }
3201
3202 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3203 {
3204 #ifdef SSE_POSSIBLE
3205         int x, startx = span->startx, endx = span->endx;
3206         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3207         tint = _mm_packs_epi32(tint, tint);
3208         for (x = startx;x+2 <= endx;x+=2)
3209         {
3210                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3211                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3212                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3213                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3214         }
3215         if (x < endx)
3216         {
3217                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3218                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3219                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3220                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3221         }
3222 #endif
3223 }
3224
3225 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3226 {
3227 #ifdef SSE_POSSIBLE
3228         int x, startx = span->startx, endx = span->endx;
3229         for (x = startx;x+2 <= endx;x+=2)
3230         {
3231                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3232                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3233                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3234                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3235                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3236         }
3237         if (x < endx)
3238         {
3239                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3240                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3241                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3242                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3243                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3244         }
3245 #endif
3246 }
3247
3248 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3249 {
3250 #ifdef SSE_POSSIBLE
3251         int x, startx = span->startx, endx = span->endx;
3252         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3253         localcolor = _mm_packs_epi32(localcolor, localcolor);
3254         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3255         for (x = startx;x+2 <= endx;x+=2)
3256         {
3257                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3258                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3259                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3260         }
3261         if (x < endx)
3262         {
3263                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3264                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3265                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3266         }
3267 #endif
3268 }
3269
3270
3271
3272 void DPSOFTRAST_VertexShader_Generic(void)
3273 {
3274         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3275         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3276         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3277         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3278                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3279 }
3280
3281 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3282 {
3283         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3284         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3286         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3287         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3288         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3289         {
3290                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3291                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3292                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3293                 {
3294                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3295                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3296                         {
3297                                 // multiply
3298                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3299                         }
3300                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3301                         {
3302                                 // add
3303                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3304                         }
3305                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3306                         {
3307                                 // alphablend
3308                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3309                         }
3310                 }
3311         }
3312         else
3313                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3314         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3315 }
3316
3317
3318
3319 void DPSOFTRAST_VertexShader_PostProcess(void)
3320 {
3321         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3322         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3323         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3324 }
3325
3326 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3327 {
3328         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3329         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3330         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3331         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3332         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3333         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3334         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3335         {
3336                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3337                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3338         }
3339         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3340         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3341         {
3342                 // TODO: implement saturation
3343         }
3344         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3345         {
3346                 // TODO: implement gammaramps
3347         }
3348         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3349 }
3350
3351
3352
3353 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3354 {
3355         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3356 }
3357
3358 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3359 {
3360         // this is never called (because colormask is off when this shader is used)
3361         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3362         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3363         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3364         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3365         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3366 }
3367
3368
3369
3370 void DPSOFTRAST_VertexShader_FlatColor(void)
3371 {
3372         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3373         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3374 }
3375
// Pixel stage of the flat-color shader for one span: samples the GL20TU_COLOR
// texture and multiplies every texel by a constant built from the
// Color_Ambient uniform (first three lanes) and the Alpha uniform (fourth lane).
// Pixels are written straight into dpsoftrast.fb_colorpixels[0] unless alpha
// test is enabled or the blend mode is not opaque; in that case they go to
// buffer_FragColorbgra8 and are finished by DPSOFTRAST_Draw_Span_FinishBGRA8.
// Does nothing when SSE_POSSIBLE is not defined.
3376 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3377 {
3378 #ifdef SSE_POSSIBLE
3379         unsigned char * RESTRICT pixelmask = span->pixelmask;
              // default destination: the framebuffer row at (span->x, span->y); x below is relative to span->x
3380         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3381         int x, startx = span->startx, endx = span->endx;
3382         __m128i Color_Ambientm;
3383         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3384         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3386         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3387         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
              // alpha test or blending needs the finish pass, so render into the temporary buffer instead
3388         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3389                 pixel = buffer_FragColorbgra8;
              // ambient color scaled by 256 with components 0 and 2 swapped...
3390         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
              // ...its fourth lane is cleared and replaced by the Alpha uniform scaled by 255...
3391         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3392         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
              // ...and the four values are packed to 16-bit lanes, repeated for two pixels
3393         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3394         for (x = startx;x < endx;x++)
3395         {
3396                 __m128i color, pix;
                      // fast path: at least four pixels remain and all four mask bytes equal 1
3397                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3398                 {
3399                         __m128i pix2;
3400                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                              // texel bytes sit in the high byte of each lane, so mulhi yields byte * color / 256
3401                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3402                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3403                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                              // together with the loop's x++ this advances by four pixels
3404                         x += 3;
3405                         continue;
3406                 }
                      // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3407                 if (!pixelmask[x])
3408                         continue;
3409                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3410                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3411                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3412         }
              // only the temporary-buffer path needs the finish pass
3413         if (pixel == buffer_FragColorbgra8)
3414                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3415 #endif
3416 }
3417
3418
3419
3420 void DPSOFTRAST_VertexShader_VertexColor(void)
3421 {
3422         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3423         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3424         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3425 }
3426
// Pixel stage of the vertex-color shader for one span: samples the
// GL20TU_COLOR texture and multiplies every texel by
// (Color_Diffuse * interpolated vertex color + Color_Ambient), with the fourth
// lane of the ambient term taken from the Alpha uniform and the fourth lane of
// the diffuse term cleared.
// Pixels are written straight into dpsoftrast.fb_colorpixels[0] unless alpha
// test is enabled or the blend mode is not opaque; in that case they go to
// buffer_FragColorbgra8 and are finished by DPSOFTRAST_Draw_Span_FinishBGRA8.
// Does nothing when SSE_POSSIBLE is not defined.
3427 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3428 {
3429 #ifdef SSE_POSSIBLE
3430         unsigned char * RESTRICT pixelmask = span->pixelmask;
              // default destination: the framebuffer row at (span->x, span->y); x below is relative to span->x
3431         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3432         int x, startx = span->startx, endx = span->endx;
3433         __m128i Color_Ambientm, Color_Diffusem;
3434         __m128 data, slope;
3435         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3436         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3437         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3438         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3439         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3440         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
              // alpha test or blending needs the finish pass, so render into the temporary buffer instead
3441         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3442                 pixel = buffer_FragColorbgra8;
              // ambient term: scaled by 256, components 0 and 2 swapped, fourth lane replaced by Alpha * 255, packed to 16-bit lanes
3443         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3444         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3445         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3446         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
              // diffuse term: scaled by 4096, components 0 and 2 swapped, fourth lane cleared, packed to 16-bit lanes
3447         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3448         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3449         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
              // vertex color attribute: same 0/2 component swap, advanced to startx, then value and slope scaled by 4096
3450         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3451         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3452         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3453         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3454         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3455         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
              // data is stepped by slope once per pixel by the loop increment (also on 'continue')
3456         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3457         {
3458                 __m128i color, mod, pix;
                      // fast path: at least four pixels remain and all four mask bytes equal 1
3459                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3460                 {
3461                         __m128i pix2, mod2;
3462                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3463                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                              // one modulator per pixel: data times that pixel's buffer_z value; data is stepped three times here
3464                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3465                         data = _mm_add_ps(data, slope);
3466                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3467                         data = _mm_add_ps(data, slope);
3468                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3469                         data = _mm_add_ps(data, slope);
3470                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                              // (diffuse * modulator >> 16) + ambient, then times the texel held in the high byte of each lane
3471                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3472                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3473                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3474                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3475                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                              // together with the loop increment this advances x by four and data by a fourth step
3476                         x += 3;
3477                         continue;
3478                 }
                      // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3479                 if (!pixelmask[x])
3480                         continue;
3481                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3482                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3483                 mod = _mm_packs_epi32(mod, mod);
3484                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3485                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3486         }
              // only the temporary-buffer path needs the finish pass
3487         if (pixel == buffer_FragColorbgra8)
3488                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3489 #endif
3490 }
3491
3492
3493
3494 void DPSOFTRAST_VertexShader_Lightmap(void)
3495 {
3496         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3497         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3498         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3499 }
3500
3501 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3502 {
3503 #ifdef SSE_POSSIBLE
3504         unsigned char * RESTRICT pixelmask = span->pixelmask;
3505         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3506         int x, startx = span->startx, endx = span->endx;
3507         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3508         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3509         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3510         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3511         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3514         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3515         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3516         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3517                 pixel = buffer_FragColorbgra8;
3518         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3519         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3520         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3521         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3522         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3523         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3524         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3525         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3526         {
3527                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3528                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3529                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3530                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3531                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3532                 for (x = startx;x < endx;x++)
3533                 {
3534                         __m128i color, lightmap, glow, pix;
3535                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3536                         {
3537                                 __m128i pix2;
3538                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3539                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3540                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3541                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3542                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3543                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3544                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3545                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3546                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3547                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3548                                 x += 3;
3549                                 continue;
3550                         }
3551                         if (!pixelmask[x])
3552                                 continue;
3553                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3554                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3555                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3556                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3557                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3558                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3559                 }
3560         }
3561         else
3562         {
3563                 for (x = startx;x < endx;x++)
3564                 {
3565                         __m128i color, lightmap, pix;
3566                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3567                         {
3568                                 __m128i pix2;
3569                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3570                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3571                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3572                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3573                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3574                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3575                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3576                                 x += 3;
3577                                 continue;
3578                         }
3579                         if (!pixelmask[x]) 
3580                                 continue;
3581                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3582                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3583                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3584                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3585                 }
3586         }
3587         if (pixel == buffer_FragColorbgra8)
3588                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3589 #endif
3590 }
3591
3592
3593 void DPSOFTRAST_VertexShader_LightDirection(void);
3594 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3595
// Vertex stage of the fake-light shader mode: identical to the
// LightDirection vertex stage, so it simply forwards to it.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3600
3601 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3602 {
3603         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3604 }
3605
3606
3607
3608 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3609 {
3610         DPSOFTRAST_VertexShader_LightDirection();
3611         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3612 }
3613
3614 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3615 {
3616         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3617 }
3618
3619
3620
3621 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3622 {
3623         DPSOFTRAST_VertexShader_LightDirection();
3624         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3625 }
3626
3627 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3628 {
3629         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3630 }
3631
3632
3633
3634 void DPSOFTRAST_VertexShader_LightDirection(void)
3635 {
3636         int i;
3637         int numvertices = dpsoftrast.numvertices;
3638         float LightDir[4];
3639         float LightVector[4];
3640         float EyePosition[4];
3641         float EyeVectorModelSpace[4];
3642         float EyeVector[4];
3643         float position[4];
3644         float svector[4];
3645         float tvector[4];
3646         float normal[4];
3647         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3648         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3649         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3650         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3651         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3652         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3653         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3654         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3655         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3656         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3657         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3658         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3659         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3660         for (i = 0;i < numvertices;i++)
3661         {
3662                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3663                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3664                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3665                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3666                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3667                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3668                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3669                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3670                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3671                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3672                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3673                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3674                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3675                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3676                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3677                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3678                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3679                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3680                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3681                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3682                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3683                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3684                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3685                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3686                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3687                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3688                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3689                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3690                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3691         }
3692         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3693 }
3694
3695 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3696 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3697 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3698 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3699 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3700 #define DPSOFTRAST_Vector3Normalize(v)\
3701 do\
3702 {\
3703         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3704         if (len)\
3705         {\
3706                 len = 1.0f / len;\
3707                 v[0] *= len;\
3708                 v[1] *= len;\
3709                 v[2] *= len;\
3710         }\
3711 }\
3712 while(0)
3713
3714 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3715 {
3716         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3717         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3718         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3719         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3720         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3721         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3722         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3723         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3724         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3725         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3726         int x, startx = span->startx, endx = span->endx;
3727         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3728         float LightVectordata[4];
3729         float LightVectorslope[4];
3730         float EyeVectordata[4];
3731         float EyeVectorslope[4];
3732         float VectorSdata[4];
3733         float VectorSslope[4];
3734         float VectorTdata[4];
3735         float VectorTslope[4];
3736         float VectorRdata[4];
3737         float VectorRslope[4];
3738         float z;
3739         float diffusetex[4];
3740         float glosstex[4];
3741         float surfacenormal[4];
3742         float lightnormal[4];
3743         float lightnormal_modelspace[4];
3744         float eyenormal[4];
3745         float specularnormal[4];
3746         float diffuse;
3747         float specular;
3748         float SpecularPower;
3749         int d[4];
3750         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3751         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3752         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3753         Color_Glow[3] = 0.0f;
3754         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3755         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3756         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3757         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3758         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3759         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3760         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3761         Color_Pants[3] = 0.0f;
3762         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3763         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3764         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3765         Color_Shirt[3] = 0.0f;
3766         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3767         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3768         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3769         {
3770                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3771                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3772         }
3773         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3774         {
3775                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3776         }
3777         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3778         {
3779                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3780                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3781                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3782                 Color_Diffuse[3] = 0.0f;
3783                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3784                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3785                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3786                 LightColor[3] = 0.0f;
3787                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3788                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3789                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3790                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3791                 Color_Specular[3] = 0.0f;
3792                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3793                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3794                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3795
3796                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3797                 {
3798                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3799                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3800                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3801                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3802                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3803                 }
3804                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3805                 {
3806                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3807                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3808                 }
3809                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3810                 {
3811                         // nothing of this needed
3812                 }
3813                 else
3814                 {
3815                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3816                 }
3817
3818                 for (x = startx;x < endx;x++)
3819                 {
3820                         z = buffer_z[x];
3821                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3822                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3823                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3824                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3825                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3826                         {
3827                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3828                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3829                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3830                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3831                         }
3832                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3833                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3834                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3835                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3836                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3837                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3838                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3839                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3840
3841                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3842                         {
3843                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3844                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3845                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3846                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3847
3848                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3849                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3850                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3851                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3852
3853                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3854                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3855                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3856                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3857
3858                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3859                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3860                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3861                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3862
3863                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3864                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3865
3866                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3867                                 {
3868                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3869                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3870                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3871                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3872                                 }
3873                         }
3874                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3875                         {
3876                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3877                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3878                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3879                                 {
3880                                         float f = 1.0f / 256.0f;
3881                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3882                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3883                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3884                                 }
3885                         }
3886                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3887                         {
3888                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3889                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3890                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3891                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3892
3893                                 LightColor[0] = 1.0;
3894                                 LightColor[1] = 1.0;
3895                                 LightColor[2] = 1.0;
3896                         }
3897                         else
3898                         {
3899                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3900                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3901                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3902                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3903                         }
3904
3905                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3906
3907                         if(thread->shader_exactspecularmath)
3908                         {
3909                                 // reflect lightnormal at surfacenormal, take the negative of that
3910                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3911                                 float f;
3912                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3913                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3914                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3915                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3916
3917                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3918                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3919                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3920                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3921                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3922
3923                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3924                         }
3925                         else
3926                         {
3927                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3928                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3929                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3930                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3931
3932                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3933                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3934                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3935                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3936
3937                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3938                         }
3939
3940                         specular = pow(specular, SpecularPower * glosstex[3]);
3941                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3942                         {
3943                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3944                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3945                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3946                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3947                         }
3948                         else
3949                         {
3950                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3951                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3952                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3953                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3954                         }
3955
3956                         buffer_FragColorbgra8[x*4+0] = d[0];
3957                         buffer_FragColorbgra8[x*4+1] = d[1];
3958                         buffer_FragColorbgra8[x*4+2] = d[2];
3959                         buffer_FragColorbgra8[x*4+3] = d[3];
3960                 }
3961         }
3962         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3963         {
3964                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3965                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3966                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3967                 Color_Diffuse[3] = 0.0f;
3968                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3969                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3970                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3971                 LightColor[3] = 0.0f;
3972                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3973
3974                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3975                 {
3976                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3977                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3978                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3979                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3980                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3981                 }
3982                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3983                 {
3984                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3985                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3986                 }
3987                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3988                 {
3989                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3990                 }
3991                 else
3992                 {
3993                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3994                 }
3995
3996                 for (x = startx;x < endx;x++)
3997                 {
3998                         z = buffer_z[x];
3999                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4000                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4001                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4002                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4003                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4004                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4005                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4006                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4007
4008                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4009                         {
4010                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4011                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4012                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4013                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4014
4015                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4016                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4017                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4018                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4019
4020                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4021                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4022                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4023                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4024
4025                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4026                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4027                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4028                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4029
4030                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4031                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4032
4033                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4034                                 {
4035                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4036                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4037                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4038                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4039                                 }
4040                         }
4041                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4042                         {
4043                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4044                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4045                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4046                                 {
4047                                         float f = 1.0f / 256.0f;
4048                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4049                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4050                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4051                                 }
4052                         }
4053                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4054                         {
4055                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4056                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4057                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4058                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4059
4060                                 LightColor[0] = 1.0;
4061                                 LightColor[1] = 1.0;
4062                                 LightColor[2] = 1.0;
4063                         }
4064                         else
4065                         {
4066                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4067                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4068                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4069                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4070                         }
4071
4072                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4073                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4074                         {
4075                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4076                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4077                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4078                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4079                         }
4080                         else
4081                         {
4082                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4083                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4084                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4085                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4086                         }
4087                         buffer_FragColorbgra8[x*4+0] = d[0];
4088                         buffer_FragColorbgra8[x*4+1] = d[1];
4089                         buffer_FragColorbgra8[x*4+2] = d[2];
4090                         buffer_FragColorbgra8[x*4+3] = d[3];
4091                 }
4092         }
4093         else
4094         {
4095                 for (x = startx;x < endx;x++)
4096                 {
4097                         z = buffer_z[x];
4098                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4099                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4100                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4101                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4102
4103                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4104                         {
4105                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4106                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4107                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4108                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4109                         }
4110                         else
4111                         {
4112                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4113                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4114                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4115                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4116                         }
4117                         buffer_FragColorbgra8[x*4+0] = d[0];
4118                         buffer_FragColorbgra8[x*4+1] = d[1];
4119                         buffer_FragColorbgra8[x*4+2] = d[2];
4120                         buffer_FragColorbgra8[x*4+3] = d[3];
4121                 }
4122         }
4123         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4124 }
4125
4126
4127
4128 void DPSOFTRAST_VertexShader_LightSource(void)
4129 {
4130         int i;
4131         int numvertices = dpsoftrast.numvertices;
4132         float LightPosition[4];
4133         float LightVector[4];
4134         float LightVectorModelSpace[4];
4135         float EyePosition[4];
4136         float EyeVectorModelSpace[4];
4137         float EyeVector[4];
4138         float position[4];
4139         float svector[4];
4140         float tvector[4];
4141         float normal[4];
4142         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4143         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4144         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4145         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4146         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4147         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4148         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4149         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4150         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4151         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4152         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4153         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4154         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4155         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4156         for (i = 0;i < numvertices;i++)
4157         {
4158                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4159                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4160                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4161                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4162                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4163                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4164                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4165                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4166                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4167                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4168                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4169                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4170                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4171                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4172                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4173                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4174                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4175                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4176                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4177                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4178                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4179                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4180                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4181                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4182                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4183                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4184                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4185                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4186                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4187                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4188                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4189                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4190         }
4191         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4192         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4193 }
4194
4195 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4196 {
4197 #ifdef SSE_POSSIBLE
4198         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4199         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4200         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4201         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4202         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4203         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4204         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4205         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4206         int x, startx = span->startx, endx = span->endx;
4207         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4208         float CubeVectordata[4];
4209         float CubeVectorslope[4];
4210         float LightVectordata[4];
4211         float LightVectorslope[4];
4212         float EyeVectordata[4];
4213         float EyeVectorslope[4];
4214         float z;
4215         float diffusetex[4];
4216         float glosstex[4];
4217         float surfacenormal[4];
4218         float lightnormal[4];
4219         float eyenormal[4];
4220         float specularnormal[4];
4221         float diffuse;
4222         float specular;
4223         float SpecularPower;
4224         float CubeVector[4];
4225         float attenuation;
4226         int d[4];
4227         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4228         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4229         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4230         Color_Glow[3] = 0.0f;
4231         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4232         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4233         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4234         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4235         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4236         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4237         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4238         Color_Diffuse[3] = 0.0f;
4239         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4240         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4241         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4242         Color_Specular[3] = 0.0f;
4243         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4244         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4245         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4246         Color_Pants[3] = 0.0f;
4247         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4248         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4249         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4250         Color_Shirt[3] = 0.0f;
4251         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4252         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4253         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4254         LightColor[3] = 0.0f;
4255         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4256         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4257         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4258         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4259         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4260         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4261         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4262         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4263         {
4264                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4265                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4266         }
4267         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4268                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4269         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4270         {
4271                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4272                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4273                 for (x = startx;x < endx;x++)
4274                 {
4275                         z = buffer_z[x];
4276                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4277                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4278                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4279                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4280                         if (attenuation < 0.01f)
4281                                 continue;
4282                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4283                         {
4284                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4285                                 if (attenuation < 0.01f)
4286                                         continue;
4287                         }
4288
4289                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4290                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4291                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4292                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4293                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4294                         {
4295                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4296                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4297                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4298                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4299                         }
4300                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4301                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4302                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4303                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4304                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4305                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4306                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4307                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4308
4309                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4310                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4311                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4312                         DPSOFTRAST_Vector3Normalize(lightnormal);
4313
4314                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4315
4316                         if(thread->shader_exactspecularmath)
4317                         {
4318                                 // reflect lightnormal at surfacenormal, take the negative of that
4319                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4320                                 float f;
4321                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4322                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4323                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4324                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4325
4326                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4327                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4328                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4329                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4330                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4331
4332                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4333                         }
4334                         else
4335                         {
4336                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4337                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4338                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4339                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4340
4341                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4342                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4343                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4344                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4345
4346                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4347                         }
4348                         specular = pow(specular, SpecularPower * glosstex[3]);
4349
4350                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4351                         {
4352                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4353                                 attenuation *= (1.0f / 255.0f);
4354                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4355                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4356                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4357                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4358                         }
4359                         else
4360                         {
4361                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4362                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4363                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4364                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4365                         }
4366                         buffer_FragColorbgra8[x*4+0] = d[0];
4367                         buffer_FragColorbgra8[x*4+1] = d[1];
4368                         buffer_FragColorbgra8[x*4+2] = d[2];
4369                         buffer_FragColorbgra8[x*4+3] = d[3];
4370                 }
4371         }
4372         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4373         {
4374                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4375                 for (x = startx;x < endx;x++)
4376                 {
4377                         z = buffer_z[x];
4378                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4379                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4380                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4381                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4382                         if (attenuation < 0.01f)
4383                                 continue;
4384                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4385                         {
4386                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4387                                 if (attenuation < 0.01f)
4388                                         continue;
4389                         }
4390
4391                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4392                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4393                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4394                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4395                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4396                         {
4397                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4398                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4399                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4400                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4401                         }
4402                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4403                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4404                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4405                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4406
4407                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4408                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4409                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4410                         DPSOFTRAST_Vector3Normalize(lightnormal);
4411
4412                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4413                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4414                         {
4415                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4416                                 attenuation *= (1.0f / 255.0f);
4417                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4418                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4419                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4420                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4421                         }
4422                         else
4423                         {
4424                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4425                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4426                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4427                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4428                         }
4429                         buffer_FragColorbgra8[x*4+0] = d[0];
4430                         buffer_FragColorbgra8[x*4+1] = d[1];
4431                         buffer_FragColorbgra8[x*4+2] = d[2];
4432                         buffer_FragColorbgra8[x*4+3] = d[3];
4433                 }
4434         }
4435         else
4436         {
4437                 for (x = startx;x < endx;x++)
4438                 {
4439                         z = buffer_z[x];
4440                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4441                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4442                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4443                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4444                         if (attenuation < 0.01f)
4445                                 continue;
4446                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4447                         {
4448                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4449                                 if (attenuation < 0.01f)
4450                                         continue;
4451                         }
4452
4453                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4454                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4455                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4456                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4457                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4458                         {
4459                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4460                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4461                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4462                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4463                         }
4464                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4465                         {
4466                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4467                                 attenuation *= (1.0f / 255.0f);
4468                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4469                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4470                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4471                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4472                         }
4473                         else
4474                         {
4475                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4476                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4477                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4478                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4479                         }
4480                         buffer_FragColorbgra8[x*4+0] = d[0];
4481                         buffer_FragColorbgra8[x*4+1] = d[1];
4482                         buffer_FragColorbgra8[x*4+2] = d[2];
4483                         buffer_FragColorbgra8[x*4+3] = d[3];
4484                 }
4485         }
4486         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4487 #endif
4488 }
4489
4490
4491
// Vertex stage for the refraction shader mode (paired with
// DPSOFTRAST_PixelShader_Refraction in DPSOFTRAST_ShaderModeTable).
// Takes no arguments; it operates on the vertex arrays and uniforms held in
// the global dpsoftrast state.
// NOTE(review): the calls below look like (outarray, inarray, matrix) -
// confirm against the DPSOFTRAST_Array_Transform* definitions.
4492 void DPSOFTRAST_VertexShader_Refraction(void)
4493 {
             // TEXCOORD1 = POSITION run through the ModelViewProjection matrix; the
             // pixel shader reads TEXCOORD1 as its ModelViewProjectionPosition varying.
             // This has to happen before POSITION itself is overwritten below.
4494         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
             // TEXCOORD0 is transformed in place by the texture matrix
4495         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
             // finally POSITION is transformed in place with the same ModelViewProjection matrix
             // (presumably including the perspective projection, going by the helper's name)
4496         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4497 }
4498
4499 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4500 {
4501         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4502
4503         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4504         float z;
4505         int x, startx = span->startx, endx = span->endx;
4506
4507         // texture reads
4508         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4509         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4510
4511         // varyings
4512         float ModelViewProjectionPositiondata[4];
4513         float ModelViewProjectionPositionslope[4];
4514
4515         // uniforms
4516         float ScreenScaleRefractReflect[2];
4517         float ScreenCenterRefractReflect[2];
4518         float DistortScaleRefractReflect[2];
4519         float RefractColor[4];
4520
4521         const unsigned char * RESTRICT pixelbase;
4522         const unsigned char * RESTRICT pixel[4];
4523         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4524         if(!texture) return;
4525         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4526
4527         // read textures
4528         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4529         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4530
4531         // read varyings
4532         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4533
4534         // read uniforms
4535         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4536         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4537         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4538         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4539         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4540         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4541         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4542         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4543         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4544         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4545
4546         // do stuff
4547         for (x = startx;x < endx;x++)
4548         {
4549                 float SafeScreenTexCoord[2];
4550                 float ScreenTexCoord[2];
4551                 float v[3];
4552                 float iw;
4553                 unsigned char c[4];
4554
4555                 z = buffer_z[x];
4556
4557                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4558                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4559                 
4560                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4561                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4562                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4563
4564                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4565                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4566                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4567                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4568                 DPSOFTRAST_Vector3Normalize(v);
4569                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4570                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4571
4572                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4573                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4574                 {
4575                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4576                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4577                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4578                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4579                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4580                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4581                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4582                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4583                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4584                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4585                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4586                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4587                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4588                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4589                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4590                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4591                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4592                 }
4593                 else
4594                 {
4595                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4596                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4597                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4598                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4599                         c[0] = pixel[0][0];
4600                         c[1] = pixel[0][1];
4601                         c[2] = pixel[0][2];
4602                 }
4603
4604                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4605                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4606                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4607                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4608                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4609         }
4610
4611         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4612 }
4613
4614
4615
// Vertex stage for the water shader mode (paired with
// DPSOFTRAST_PixelShader_Water in DPSOFTRAST_ShaderModeTable).
// Only the POSITION array is processed: it is transformed in place with the
// ModelViewProjection matrix taken from the global uniforms.
4616 void DPSOFTRAST_VertexShader_Water(void)
4617 {
4618         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4619 }
4620
4621
4622 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4623 {
4624         // TODO: IMPLEMENT
4625         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4626         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4627         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4628         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4629         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4630 }
4631
4632
4633
// Vertex stage for the show-depth shader mode (paired with
// DPSOFTRAST_PixelShader_ShowDepth in DPSOFTRAST_ShaderModeTable).
// Only the POSITION array is processed: it is transformed in place with the
// ModelViewProjection matrix taken from the global uniforms.
4634 void DPSOFTRAST_VertexShader_ShowDepth(void)
4635 {
4636         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4637 }
4638
4639 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4640 {
4641         // TODO: IMPLEMENT
4642         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4643         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4644         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4645         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4646         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4647 }
4648
4649
4650
// Vertex stage for the deferred-geometry shader mode (paired with
// DPSOFTRAST_PixelShader_DeferredGeometry in DPSOFTRAST_ShaderModeTable).
// Only the POSITION array is processed: it is transformed in place with the
// ModelViewProjection matrix taken from the global uniforms.
4651 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4652 {
4653         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4654 }
4655
4656 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4657 {
4658         // TODO: IMPLEMENT
4659         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4660         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4661         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4662         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4663         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4664 }
4665
4666
4667
// Vertex stage for the deferred-light-source shader mode (paired with
// DPSOFTRAST_PixelShader_DeferredLightSource in DPSOFTRAST_ShaderModeTable).
// Only the POSITION array is processed: it is transformed in place with the
// ModelViewProjection matrix taken from the global uniforms.
4668 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4669 {
4670         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4671 }
4672
4673 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4674 {
4675         // TODO: IMPLEMENT
4676         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4677         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4678         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4679         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4680         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4681 }
4682
4683
4684
// Describes one shader mode of the software rasterizer: which functions
// implement its vertex and span (pixel) stages and which vertex arrays and
// texture units it refers to. Instances live in DPSOFTRAST_ShaderModeTable.
4685 typedef struct DPSOFTRAST_ShaderModeInfo_s
4686 {
4687         int lodarrayindex; // every table entry uses 2; presumably the vertex array used for texture LOD selection - TODO confirm
4688         void (*Vertex)(void); // vertex stage; takes no arguments and works on the global dpsoftrast state
4689         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // shades one span of one triangle
4690         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // DPSOFTRAST_ARRAY_* indices; the table ends each list with ~0 (0xFF once stored in an unsigned char)
4691         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // GL20TU_* texture units; lists in the table are ended the same way
4692 }
4693 DPSOFTRAST_ShaderModeInfo;
4694
// One DPSOFTRAST_ShaderModeInfo per shader mode (SHADERMODE_COUNT entries,
// presumably in SHADERMODE_* enum order - verify against the enum).
// Row layout: {lodarrayindex, vertex shader, span shader, {arrays..., ~0}, {texunits..., ~0}}
// where ~0 marks the end of each list.
// NOTE(review): the last four rows give no texunits initializer, so by the C
// aggregate-initialization rules their texunits arrays are all zero rather than
// starting with the ~0 end marker used by the Depth_Or_Shadow row - confirm the
// code that walks texunits tolerates this.
4695 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4696 {
4697         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4698         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4699         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4700         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4701         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4702         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4703         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4704         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4705         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4706         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4707         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4708         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4709         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4710         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4711         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4712         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4713 };
4714
4715 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4716 {
4717         int i;
4718         int x;
4719         int startx;
4720         int endx;
4721 //      unsigned int c;
4722 //      unsigned int *colorpixel;
4723         unsigned int *depthpixel;
4724         float w;
4725         float wslope;
4726         int depth;
4727         int depthslope;
4728         unsigned int d;
4729         DPSOFTRAST_State_Triangle *triangle;
4730         DPSOFTRAST_State_Span *span;
4731         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4732         for (i = 0; i < thread->numspans; i++)
4733         {
4734                 span = &thread->spans[i];
4735                 triangle = &thread->triangles[span->triangle];
4736                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4737                 {
4738                         wslope = triangle->w[0];
4739                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4740                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4741                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4742                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4743                         startx = span->startx;
4744                         endx = span->endx;
4745                         switch(thread->fb_depthfunc)
4746                         {
4747                         default:
4748                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4749                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4750                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4751                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4752                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4753                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4754                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4755                         }
4756                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4757                         //for (x = startx;x < endx;x++)
4758                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4759                         // if there is no color buffer, skip pixel shader
4760                         while (startx < endx && !pixelmask[startx])
4761                                 startx++;
4762                         while (endx > startx && !pixelmask[endx-1])
4763                                 endx--;
4764                         if (startx >= endx)
4765                                 continue; // no pixels to fill
4766                         span->pixelmask = pixelmask;
4767                         span->startx = startx;
4768                         span->endx = endx;
4769                         // run pixel shader if appropriate
4770                         // do this before running depthmask code, to allow the pixelshader
4771                         // to clear pixelmask values for alpha testing
4772                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4773                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4774                         if (thread->depthmask)
4775                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4776                                         if (pixelmask[x])
4777                                                 depthpixel[x] = d;
4778                 }
4779                 else
4780                 {
4781                         // no depth testing means we're just dealing with color...
4782                         // if there is no color buffer, skip pixel shader
4783                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4784                         {
4785                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4786                                 span->pixelmask = pixelmask;
4787                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4788                         }
4789                 }
4790         }
4791         thread->numspans = 0;
4792 }
4793
4794 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4795
4796 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4797 {
4798 #ifdef SSE_POSSIBLE
4799         int cullface = thread->cullface;
4800         int minx, maxx, miny, maxy;
4801         int miny1, maxy1, miny2, maxy2;
4802         __m128i fbmin, fbmax;
4803         __m128 viewportcenter, viewportscale;
4804         int firstvertex = command->firstvertex;
4805         int numvertices = command->numvertices;
4806         int numtriangles = command->numtriangles;
4807         const int *element3i = command->element3i;
4808         const unsigned short *element3s = command->element3s;
4809         int clipped = command->clipped;
4810         int i;
4811         int j;
4812         int k;
4813         int y;
4814         int e[3];
4815         __m128i screeny;
4816         int starty, endy, bandy;
4817         int numpoints;
4818         int clipcase;
4819         float clipdist[4];
4820         float clip0origin, clip0slope;
4821         int clip0dir;
4822         __m128 triangleedge1, triangleedge2, trianglenormal;
4823         __m128 clipfrac[3];
4824         __m128 screen[4];
4825         DPSOFTRAST_State_Triangle *triangle;
4826         DPSOFTRAST_Texture *texture;
4827         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4828         miny = thread->fb_scissor[1];
4829         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4830         miny1 = bound(miny, thread->miny1, maxy);
4831         maxy1 = bound(miny, thread->maxy1, maxy);
4832         miny2 = bound(miny, thread->miny2, maxy);
4833         maxy2 = bound(miny, thread->maxy2, maxy);
4834         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4835         {
4836                 if (!ATOMIC_DECREMENT(command->refcount))
4837                 {
4838                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4839                                 MM_FREE(command->arrays);
4840                 }
4841                 return;
4842         }
4843         minx = thread->fb_scissor[0];
4844         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4845         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4846         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4847         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4848         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4849         screen[3] = _mm_setzero_ps();
4850         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4851         for (i = 0;i < numtriangles;i++)
4852         {
4853                 const float *screencoord4f = command->arrays;
4854                 const float *arrays = screencoord4f + numvertices*4;
4855
4856                 // generate the 3 edges of this triangle
4857                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4858                 if (element3s)
4859                 {
4860                         e[0] = element3s[i*3+0] - firstvertex;
4861                         e[1] = element3s[i*3+1] - firstvertex;
4862                         e[2] = element3s[i*3+2] - firstvertex;
4863                 }
4864                 else if (element3i)
4865                 {
4866                         e[0] = element3i[i*3+0] - firstvertex;
4867                         e[1] = element3i[i*3+1] - firstvertex;
4868                         e[2] = element3i[i*3+2] - firstvertex;
4869                 }
4870                 else
4871                 {
4872                         e[0] = i*3+0;
4873                         e[1] = i*3+1;
4874                         e[2] = i*3+2;
4875                 }
4876
4877 #define SKIPBACKFACE \
4878                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4879                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4880                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4881                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4882                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4883                 switch(cullface) \
4884                 { \
4885                 case GL_BACK: \
4886                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4887                                 continue; \
4888                         break; \
4889                 case GL_FRONT: \
4890                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4891                                 continue; \
4892                         break; \
4893                 }
4894
4895 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4896                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4897                         { \
4898                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4899                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4900                         }
4901 #define CLIPPEDVERTEXCOPY(k,p1) \
4902                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4903
4904 #define GENATTRIBCOPY(attrib, p1) \
4905                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4906 #define GENATTRIBLERP(attrib, p1, p2) \
4907                 { \
4908                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4909                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4910                 }
4911 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4912                 switch(clipcase) \
4913                 { \
4914                 default: \
4915                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4916                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4917                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4918                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4919                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4920                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4921                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4922                 }
4923
4924                 if (! clipped)
4925                         goto notclipped;
4926
4927                 // calculate distance from nearplane
4928                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4929                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4930                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4931                 if (clipdist[0] >= 0.0f)
4932                 {
4933                         if (clipdist[1] >= 0.0f)
4934                         {
4935                                 if (clipdist[2] >= 0.0f)
4936                                 {
4937                                 notclipped:
4938                                         // triangle is entirely in front of nearplane
4939                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4940                                         SKIPBACKFACE;
4941                                         numpoints = 3;
4942                                         clipcase = 0;
4943                                 }
4944                                 else
4945                                 {
4946                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4947                                         SKIPBACKFACE;
4948                                         numpoints = 4;
4949                                         clipcase = 1;
4950                                 }
4951                         }
4952                         else
4953                         {
4954                                 if (clipdist[2] >= 0.0f)
4955                                 {
4956                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4957                                         SKIPBACKFACE;
4958                                         numpoints = 4;
4959                                         clipcase = 2;
4960                                 }
4961                                 else
4962                                 {
4963                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4964                                         SKIPBACKFACE;
4965                                         numpoints = 3;
4966                                         clipcase = 3;
4967                                 }
4968                         }
4969                 }
4970                 else if (clipdist[1] >= 0.0f)
4971                 {
4972                         if (clipdist[2] >= 0.0f)
4973                         {
4974                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4975                                 SKIPBACKFACE;
4976                                 numpoints = 4;
4977                                 clipcase = 4;
4978                         }
4979                         else
4980                         {
4981                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4982                                 SKIPBACKFACE;
4983                                 numpoints = 3;
4984                                 clipcase = 5;
4985                         }
4986                 }
4987                 else if (clipdist[2] >= 0.0f)
4988                 {
4989                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4990                         SKIPBACKFACE;
4991                         numpoints = 3;
4992                         clipcase = 6;
4993                 }
4994                 else continue; // triangle is entirely behind nearplane
4995
4996                 {
4997                         // calculate integer y coords for triangle points
4998                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4999                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5000                                         screenmin = _mm_min_epi16(screeni, screenir),
5001                                         screenmax = _mm_max_epi16(screeni, screenir);
5002                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5003                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5004                         screenmin = _mm_max_epi16(screenmin, fbmin);
5005                         screenmax = _mm_min_epi16(screenmax, fbmax);
5006                         // skip offscreen triangles
5007                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5008                                 continue;
5009                         starty = _mm_extract_epi16(screenmin, 1);
5010                         endy = _mm_extract_epi16(screenmax, 1)+1;
5011                         if (starty >= maxy1 && endy <= miny2)
5012                                 continue;
5013                         screeny = _mm_srai_epi32(screeni, 16);
5014                 }
5015
5016                 triangle = &thread->triangles[thread->numtriangles];
5017
5018                 // calculate attribute plans for triangle data...
5019                 // okay, this triangle is going to produce spans, we'd better project
5020                 // the interpolants now (this is what gives perspective texturing),
5021                 // this consists of simply multiplying all arrays by the W coord
5022                 // (which is basically 1/Z), which will be undone per-pixel
5023                 // (multiplying by Z again) to get the perspective-correct array
5024                 // values
5025                 {
5026                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5027                         __m128 mipedgescale, mipdensity;
5028                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5029                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5030                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5031                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5032                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5033                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5034                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5035                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5036                         attribedge1 = _mm_sub_ss(w0, w1);
5037                         attribedge2 = _mm_sub_ss(w2, w1);
5038                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5039                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5040                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5041                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5042                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5043                         _mm_store_ss(&triangle->w[0], attribxslope);
5044                         _mm_store_ss(&triangle->w[1], attribyslope);
5045                         _mm_store_ss(&triangle->w[2], attriborigin);
5046                         
5047                         clip0origin = 0;
5048                         clip0slope = 0;
5049                         clip0dir = 0;
5050                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5051                         {
5052                                 float cliporigin, clipxslope, clipyslope;
5053                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5054                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5055                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5056                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5057                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5058                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5059                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5060                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5061                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5062                                 if(clipxslope != 0)
5063                                 {
5064                                         clip0origin = -cliporigin/clipxslope;
5065                                         clip0slope = -clipyslope/clipxslope;
5066                                         clip0dir = clipxslope > 0 ? 1 : -1;
5067                                 }
5068                                 else if(clipyslope > 0)
5069                                 {
5070                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5071                                         clip0slope = dpsoftrast.fb_width;
5072                                         clip0dir = -1;
5073                                 }
5074                                 else if(clipyslope < 0)
5075                                 {
5076                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5077                                         clip0slope = -dpsoftrast.fb_width;
5078                                         clip0dir = -1;
5079                                 }
5080                                 else if(clip0origin < 0) continue;
5081                         }
5082
5083                         mipedgescale = _mm_setzero_ps();
5084                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5085                         {
5086                                 __m128 attrib0, attrib1, attrib2;
5087                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5088                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5089                                         break;
5090                                 arrays += numvertices*4;
5091                                 GENATTRIBS(attrib0, attrib1, attrib2);
5092                                 attriborigin = _mm_mul_ps(attrib1, w1);
5093                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5094                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5095                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5096                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5097                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5098                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5099                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5100                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5101                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5102                                 {
5103                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5104                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5105                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5106                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5107                                 }
5108                         }
5109
5110                         memset(triangle->mip, 0, sizeof(triangle->mip));
5111                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5112                         {
5113                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5114                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5115                                         break;
5116                                 texture = thread->texbound[texunit];
5117                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5118                                 {
5119                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5120                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5121                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5122                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5123                                         // this will be multiplied in the texturing routine by the texture resolution
5124                                         y = _mm_cvtss_si32(mipdensity);
5125                                         if (y > 0)
5126                                         {
5127                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5128                                                 if (y > texture->mipmaps - 1)
5129                                                         y = texture->mipmaps - 1;
5130                                                 triangle->mip[texunit] = y;
5131                                         }
5132                                 }
5133                         }
5134                 }
5135         
5136                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5137                 for (; y < bandy;)
5138                 {
5139                         __m128 xcoords, xslope;
5140                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5141                         int yccmask = _mm_movemask_epi8(ycc);
5142                         int edge0p, edge0n, edge1p, edge1n;
5143                         int nexty;
5144                         float clip0;
5145                         if (numpoints == 4)
5146                         {
5147                                 switch(yccmask)
5148                                 {
5149                                 default:
5150                                 case 0xFFFF: /*0000*/ y = endy; continue;
5151                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5152                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5153                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5154                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5155                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5156                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5157                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5158                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5159                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5160                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5161                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5162                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5163                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5164                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5165                                 case 0x0000: /*1111*/ y++; continue;
5166                                 }
5167                         }
5168                         else
5169                         {
5170                                 switch(yccmask)
5171                                 {
5172                                 default:
5173                                 case 0xFFFF: /*000*/ y = endy; continue;
5174                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5175                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5176                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5177                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5178                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5179                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5180                                 case 0x0000: /*111*/ y++; continue;
5181                                 }
5182                         }
5183                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5184                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5185                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5186                         nexty = _mm_extract_epi16(ycc, 0);
5187                         if (nexty >= bandy) nexty = bandy-1;
5188                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5189                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5190                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5191                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5192                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5193                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5194                         {
5195                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5196                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5197                         }
5198                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5199                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5200                         {
5201                                 int startx, endx, clipx = minx, offset;
5202                                 startx = _mm_cvtss_si32(xcoords);
5203                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5204                                 if (startx < minx) 
5205                                 {
5206                                         if (startx < 0) startx = 0;
5207                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5208                                 }
5209                                 if (endx > maxx) endx = maxx;
5210                                 if (startx >= endx) continue;
5211
5212                                 if (clip0dir)
5213                                 {
5214                                         if (clip0dir > 0)
5215                                         {
5216                                                 if (startx < clip0) 
5217                                                 {
5218                                                         if(endx <= clip0) continue;
5219                                                         clipx = max((int)clip0, minx);
5220                                                         startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1); 
5221                                                 }
5222                                         }
5223                                         else if (endx > clip0) 
5224                                         {
5225                                                 if(startx >= clip0) continue;
5226                                                 endx = (int)clip0;
5227                                         }
5228                                 }
5229                                                 
5230                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5231                                 {
5232                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5233                                         span->triangle = thread->numtriangles;
5234                                         span->x = offset;
5235                                         span->y = y;
5236                                         span->startx = max(clipx - offset, 0);
5237                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5238                                         if (span->startx >= span->endx)
5239                                                 continue; 
5240                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5241                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5242                                 }
5243                         }
5244                 }
5245
5246                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5247                 {
5248                         DPSOFTRAST_Draw_ProcessSpans(thread);
5249                         thread->numtriangles = 0;
5250                 }
5251         }
5252
5253         if (!ATOMIC_DECREMENT(command->refcount))
5254         {
5255                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5256                         MM_FREE(command->arrays);
5257         }
5258
5259         if (thread->numspans > 0 || thread->numtriangles > 0)
5260         {
5261                 DPSOFTRAST_Draw_ProcessSpans(thread);
5262                 thread->numtriangles = 0;
5263         }
5264 #endif
5265 }
5266
5267 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5268 {
5269         int i;
5270         int j;
5271         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5272         int datasize = 2*numvertices*sizeof(float[4]);
5273         DPSOFTRAST_Command_Draw *command;
5274         unsigned char *data;
5275         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5276         {
5277                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5278                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5279                         break;
5280                 datasize += numvertices*sizeof(float[4]);
5281         }
5282         if (element3s)
5283                 datasize += numtriangles*sizeof(unsigned short[3]);
5284         else if (element3i)
5285                 datasize += numtriangles*sizeof(int[3]);
5286         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5287         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5288         {
5289                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5290                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5291         }
5292         else
5293         {
5294                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5295                 data = (unsigned char *)command + commandsize;
5296         }
5297         command->firstvertex = firstvertex;
5298         command->numvertices = numvertices;
5299         command->numtriangles = numtriangles;
5300         command->arrays = (float *)data;
5301         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5302         dpsoftrast.firstvertex = firstvertex;
5303         dpsoftrast.numvertices = numvertices;
5304         dpsoftrast.screencoord4f = (float *)data;
5305         data += numvertices*sizeof(float[4]);
5306         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5307         data += numvertices*sizeof(float[4]);
5308         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5309         {
5310                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5311                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5312                         break;
5313                 dpsoftrast.post_array4f[j] = (float *)data;
5314                 data += numvertices*sizeof(float[4]);
5315         }
5316         command->element3i = NULL;
5317         command->element3s = NULL;
5318         if (element3s)
5319         {
5320                 command->element3s = (unsigned short *)data;
5321                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5322         }
5323         else if (element3i)
5324         {
5325                 command->element3i = (int *)data;
5326                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5327         }
5328         return command;
5329 }
5330
5331 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5332 {
5333         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5334         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5335         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5336         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5337         if (command->starty >= command->endy)
5338         {
5339                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5340                         MM_FREE(command->arrays);
5341                 DPSOFTRAST_UndoCommand(command->commandsize);
5342                 return;
5343         }
5344         command->clipped = dpsoftrast.drawclipped;
5345         command->refcount = dpsoftrast.numthreads;
5346
5347         if (dpsoftrast.usethreads)
5348         {
5349                 int i;
5350                 DPSOFTRAST_Draw_SyncCommands();
5351                 for (i = 0; i < dpsoftrast.numthreads; i++)
5352                 {
5353                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5354                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5355                                 Thread_CondSignal(thread->drawcond);
5356                 }
5357         }
5358         else
5359         {
5360                 DPSOFTRAST_Draw_FlushThreads();
5361         }
5362 }
5363
5364 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5365 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5366 {
5367         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5368 }
5369 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5370 {
5371         DPSOFTRAST_Command_SetRenderTargets *command;
5372         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5373                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5374                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5375                 DPSOFTRAST_Flush();
5376         dpsoftrast.fb_width = width;
5377         dpsoftrast.fb_height = height;
5378         dpsoftrast.fb_depthpixels = depthpixels;
5379         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5380         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5381         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5382         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5383         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5384         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5385         command->width = width;
5386         command->height = height;
5387 }
5388  
5389 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5390 {
5391         int commandoffset = thread->commandoffset;
5392         while (commandoffset != endoffset)
5393         {
5394                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5395                 switch (command->opcode)
5396                 {
5397 #define INTERPCOMMAND(name) \
5398                 case DPSOFTRAST_OPCODE_##name : \
5399                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5400                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5401                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5402                                 commandoffset = 0; \
5403                         break;
5404                 INTERPCOMMAND(Viewport)
5405                 INTERPCOMMAND(ClearColor)
5406                 INTERPCOMMAND(ClearDepth)
5407                 INTERPCOMMAND(ColorMask)
5408                 INTERPCOMMAND(DepthTest)
5409                 INTERPCOMMAND(ScissorTest)
5410                 INTERPCOMMAND(Scissor)
5411                 INTERPCOMMAND(BlendFunc)
5412                 INTERPCOMMAND(BlendSubtract)
5413                 INTERPCOMMAND(DepthMask)
5414                 INTERPCOMMAND(DepthFunc)
5415                 INTERPCOMMAND(DepthRange)
5416                 INTERPCOMMAND(PolygonOffset)
5417                 INTERPCOMMAND(CullFace)
5418                 INTERPCOMMAND(AlphaTest)
5419                 INTERPCOMMAND(AlphaFunc)
5420                 INTERPCOMMAND(SetTexture)
5421                 INTERPCOMMAND(SetShader)
5422                 INTERPCOMMAND(Uniform4f)
5423                 INTERPCOMMAND(UniformMatrix4f)
5424                 INTERPCOMMAND(Uniform1i)
5425                 INTERPCOMMAND(SetRenderTargets)
5426                 INTERPCOMMAND(ClipPlane)
5427
5428                 case DPSOFTRAST_OPCODE_Draw:
5429                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5430                         commandoffset += command->commandsize;
5431                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5432                                 commandoffset = 0;
5433                         thread->commandoffset = commandoffset;
5434                         break;
5435
5436                 case DPSOFTRAST_OPCODE_Reset:
5437                         commandoffset = 0;
5438                         break;
5439                 }
5440         }
5441         thread->commandoffset = commandoffset;
5442 }
5443
5444 static int DPSOFTRAST_Draw_Thread(void *data)
5445 {
5446         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5447         while(thread->index >= 0)
5448         {
5449                 if (thread->commandoffset != dpsoftrast.drawcommand)
5450                 {
5451                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5452                 }
5453                 else 
5454                 {
5455                         Thread_LockMutex(thread->drawmutex);
5456                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5457                         {
5458                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5459                                 thread->starving = true;
5460                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5461                                 thread->starving = false;
5462                         }
5463                         Thread_UnlockMutex(thread->drawmutex);
5464                 }
5465         }   
5466         return 0;
5467 }
5468
5469 static void DPSOFTRAST_Draw_FlushThreads(void)
5470 {
5471         DPSOFTRAST_State_Thread *thread;
5472         int i;
5473         DPSOFTRAST_Draw_SyncCommands();
5474         if (dpsoftrast.usethreads) 
5475         {
5476                 for (i = 0; i < dpsoftrast.numthreads; i++)
5477                 {
5478                         thread = &dpsoftrast.threads[i];
5479                         if (thread->commandoffset != dpsoftrast.drawcommand)
5480                         {
5481                                 Thread_LockMutex(thread->drawmutex);
5482                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5483                                         Thread_CondSignal(thread->drawcond);
5484                                 Thread_UnlockMutex(thread->drawmutex);
5485                         }
5486                 }
5487                 for (i = 0; i < dpsoftrast.numthreads; i++)
5488                 {
5489                         thread = &dpsoftrast.threads[i];
5490                         if (thread->commandoffset != dpsoftrast.drawcommand)
5491                         {
5492                                 Thread_LockMutex(thread->drawmutex);
5493                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5494                                 {
5495                                         thread->waiting = true;
5496                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5497                                         thread->waiting = false;
5498                                 }
5499                                 Thread_UnlockMutex(thread->drawmutex);
5500                         }
5501                 }
5502         }
5503         else
5504         {
5505                 for (i = 0; i < dpsoftrast.numthreads; i++)
5506                 {
5507                         thread = &dpsoftrast.threads[i];
5508                         if (thread->commandoffset != dpsoftrast.drawcommand)
5509                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5510                 }
5511         }
5512         dpsoftrast.commandpool.usedcommands = 0;
5513 }
5514
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which
// processes all queued commands before returning.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5519
// Currently identical to DPSOFTRAST_Flush(); it simply calls it.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5524
5525 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5526 {
5527         int i;
5528         union
5529         {
5530                 int i;
5531                 unsigned char b[4];
5532         }
5533         u;
5534         u.i = 1;
5535         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5536         dpsoftrast.bigendian = u.b[3];
5537         dpsoftrast.fb_width = width;
5538         dpsoftrast.fb_height = height;
5539         dpsoftrast.fb_depthpixels = depthpixels;
5540         dpsoftrast.fb_colorpixels[0] = colorpixels;
5541         dpsoftrast.fb_colorpixels[1] = NULL;
5542         dpsoftrast.fb_colorpixels[1] = NULL;
5543         dpsoftrast.fb_colorpixels[1] = NULL;
5544         dpsoftrast.viewport[0] = 0;
5545         dpsoftrast.viewport[1] = 0;
5546         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5547         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5548         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5549         dpsoftrast.texture_firstfree = 1;
5550         dpsoftrast.texture_end = 1;
5551         dpsoftrast.texture_max = 0;
5552         dpsoftrast.color[0] = 1;
5553         dpsoftrast.color[1] = 1;
5554         dpsoftrast.color[2] = 1;
5555         dpsoftrast.color[3] = 1;
5556         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5557         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5558         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5559         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5560         for (i = 0; i < dpsoftrast.numthreads; i++)
5561         {
5562                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5563                 thread->index = i;
5564                 thread->cullface = GL_BACK;
5565                 thread->colormask[1] = 1;
5566                 thread->colormask[2] = 1;
5567                 thread->colormask[3] = 1;
5568                 thread->blendfunc[0] = GL_ONE;
5569                 thread->blendfunc[1] = GL_ZERO;
5570                 thread->depthmask = true;
5571                 thread->depthtest = true;
5572                 thread->depthfunc = GL_LEQUAL;
5573                 thread->scissortest = false;
5574                 thread->alphatest = false;
5575                 thread->alphafunc = GL_GREATER;
5576                 thread->alphavalue = 0.5f;
5577                 thread->viewport[0] = 0;
5578                 thread->viewport[1] = 0;
5579                 thread->viewport[2] = dpsoftrast.fb_width;
5580                 thread->viewport[3] = dpsoftrast.fb_height;
5581                 thread->scissor[0] = 0;
5582                 thread->scissor[1] = 0;
5583                 thread->scissor[2] = dpsoftrast.fb_width;
5584                 thread->scissor[3] = dpsoftrast.fb_height;
5585                 thread->depthrange[0] = 0;
5586                 thread->depthrange[1] = 1;
5587                 thread->polygonoffset[0] = 0;
5588                 thread->polygonoffset[1] = 0;
5589                 thread->clipplane[0] = 0;
5590                 thread->clipplane[1] = 0;
5591                 thread->clipplane[2] = 0;
5592                 thread->clipplane[3] = 1;
5593         
5594                 thread->numspans = 0;
5595                 thread->numtriangles = 0;
5596                 thread->commandoffset = 0;
5597                 thread->waiting = false;
5598                 thread->starving = false;
5599            
5600                 thread->validate = -1;
5601                 DPSOFTRAST_Validate(thread, -1);
5602  
5603                 if (dpsoftrast.usethreads)
5604                 {
5605                         thread->waitcond = Thread_CreateCond();
5606                         thread->drawcond = Thread_CreateCond();
5607                         thread->drawmutex = Thread_CreateMutex();
5608                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5609                 }
5610         }
5611         return 0;
5612 }
5613
5614 void DPSOFTRAST_Shutdown(void)
5615 {
5616         int i;
5617         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5618         {
5619                 DPSOFTRAST_State_Thread *thread;
5620                 for (i = 0; i < dpsoftrast.numthreads; i++)
5621                 {
5622                         thread = &dpsoftrast.threads[i];
5623                         Thread_LockMutex(thread->drawmutex);
5624                         thread->index = -1;
5625                         Thread_CondSignal(thread->drawcond);
5626                         Thread_UnlockMutex(thread->drawmutex);
5627                         Thread_WaitThread(thread->thread, 0);
5628                         Thread_DestroyCond(thread->waitcond);
5629                         Thread_DestroyCond(thread->drawcond);
5630                         Thread_DestroyMutex(thread->drawmutex);
5631                 }
5632         }
5633         for (i = 0;i < dpsoftrast.texture_end;i++)
5634                 if (dpsoftrast.texture[i].bytes)
5635                         MM_FREE(dpsoftrast.texture[i].bytes);
5636         if (dpsoftrast.texture)
5637                 free(dpsoftrast.texture);
5638         if (dpsoftrast.threads)
5639                 MM_FREE(dpsoftrast.threads);
5640         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5641 }
5642