Fix MSVC++ 2015 warnings about variable scope and some narrowing conversions without...
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// ALIGN_SIZE is the byte alignment handed to _mm_malloc by MM_MALLOC/MM_CALLOC
// and reused as COMMAND_SIZE; the ALIGN()/ATOMIC() variants below hard-code
// the same 16 and 4 instead of referring to these two defines
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// per-toolchain definitions of the alignment and atomic-counter helpers;
// any helper left undefined here (for example when SSE_POSSIBLE is not set)
// receives a plain fallback definition later in this file
#ifdef SSE_POSSIBLE
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile LONG
		// this LONG * cast serves to fix an issue with broken mingw
		// packages on Ubuntu; these only declare the function to take
		// a LONG *, causing a compile error here. This seems to be
		// error- and warn-free on platforms that DO declare
		// InterlockedIncrement correctly, like mingw on Windows.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		// MSVC places the alignment specifier before the declaration
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(4)) var
		#define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif
64
// portable fallbacks for every helper that is still undefined at this point;
// these add no alignment attribute and use ordinary, non-atomic arithmetic
#if !defined(ALIGN)
	#define ALIGN(var) var
#endif

#if !defined(ATOMIC)
	#define ATOMIC(var) var
#endif

#if !defined(MEMORY_BARRIER)
	#define MEMORY_BARRIER ((void)0)
#endif

#if !defined(ATOMIC_COUNTER)
	#define ATOMIC_COUNTER int
#endif

// both evaluate to the updated counter value
#if !defined(ATOMIC_INCREMENT)
	#define ATOMIC_INCREMENT(counter) (++(counter))
#endif

#if !defined(ATOMIC_DECREMENT)
	#define ATOMIC_DECREMENT(counter) (--(counter))
#endif

// yields no value
#if !defined(ATOMIC_ADD)
	#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// supply _mm_cvtss_f32 through the vector-extract builtin on GCC releases
// older than 4.6 (presumably those lack a usable definition - TODO confirm);
// the version test must compare major and minor together, otherwise every
// later major release with a small minor number would match as well
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// calloc-style allocator on top of _mm_malloc: returns nmemb*size zeroed
// bytes aligned to ALIGN_SIZE, or NULL when the allocation fails or the
// requested byte count does not fit in size_t; release with MM_FREE
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// without SSE the C library allocators are used directly
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// indices of the per-vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size attribute tables
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// command structs carry the same alignment attribute as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header shared by every command; the structs generated by
// DEFCOMMAND start with these same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the struct type
// DPSOFTRAST_Command_<name>, which consists of the common header followed by
// `fields`; `fields` is a single macro argument, so the member declarations
// passed in must not contain top-level commas
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
// size in bytes of the command pool buffer
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// fixed-size buffer that holds queued commands
// NOTE: the struct body is passed through the ALIGN() macro, so member
// declarations must stay free of top-level commas (one declarator each)
typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
{
	int freecommand;
	int usedcommands;
	ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
173
// per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros
// NOTE: the struct body is passed through the ALIGN() macro, so member
// declarations must stay free of top-level commas (one declarator each)
typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
{
	unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
	float w[3];
	// for each attribute array, three 4-component vectors: [0] is multiplied
	// by the span x, [1] by the span y and [2] is the constant term (see
	// DPSOFTRAST_CALCATTRIB); aligned because it is read with _mm_load_ps
	ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
181
// evaluates one interpolated attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// triangle and span are pointer expressions; data and slope are __m128 lvalues
// (the attribs rows are read with aligned loads). triangle, span and
// arrayindex are evaluated more than once, so pass side-effect-free
// expressions.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[(arrayindex)][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[(arrayindex)][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[(arrayindex)][1])))); \
}
// scalar counterpart of DPSOFTRAST_CALCATTRIB: data and slope are arrays of
// (at least) four floats. Every macro argument is parenthesized so that
// expressions such as &span or i+1 can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
198                                         
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
200
// one horizontal run of pixels produced while rasterizing a triangle
// NOTE: the struct body is passed through the ALIGN() macro, so member
// declarations must stay free of top-level commas (one declarator each)
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x; // framebuffer x coord
	int y; // framebuffer y coord
	int startx; // usable range (according to pixelmask)
	int endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
	int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
213
// capacities of the per-thread span, triangle and pixelmask buffers
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bit flags kept in DPSOFTRAST_State_Thread.validate; a set bit means the
// matching derived state must be recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all derived state
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// blend modes selected by DPSOFTRAST_RecalcBlendFunc from the GL blend
// factors; DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
238
// state owned by one rasterizer thread: its copy of the render state, the
// values derived from it, and its span/triangle work buffers
// NOTE: the struct body is passed through the ALIGN() macro, so member
// declarations must stay free of top-level commas (one declarator each)
typedef ALIGN(struct DPSOFTRAST_State_Thread_s
{
	void *thread;
	int index;
	
	int cullface;
	int colormask[4];
	int blendfunc[2];
	int blendsubtract;
	int depthmask;
	int depthtest;
	int depthfunc;
	int scissortest;
	int viewport[4];
	int scissor[4];
	float depthrange[2];
	float polygonoffset[2];
	float clipplane[4];
	// clipplane rescaled by DPSOFTRAST_RecalcClipPlane using the viewport
	// scale and center below
	ALIGN(float fb_clipplane[4]);

	int shader_mode;
	int shader_permutation;
	int shader_exactspecularmath;

	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
	
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	// DPSOFTRAST_VALIDATE_ flags
	int validate;

	// derived values (DPSOFTRAST_VALIDATE_FB)
	int fb_colormask;
	// x, y, width, height after clamping to the framebuffer
	int fb_scissor[4];
	// filled in by DPSOFTRAST_RecalcViewport; element [0] is set to the
	// constants 0 (center) and 1 (scale), [1] and [2] come from the viewport
	// x and y extents, [3] is 0.5
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	// derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
	int fb_depthfunc;

	// derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
	int fb_blendmode;

	// band boundaries
	// (computed by DPSOFTRAST_RecalcThread; the second pair equals the first
	// unless dpsoftrast.interlace is set)
	int miny1;
	int maxy1;
	int miny2;
	int maxy2;

	ATOMIC(volatile int commandoffset);

	volatile bool waiting;
	volatile bool starving;
	void *waitcond;
	void *drawcond;
	void *drawmutex;

	int numspans;
	int numtriangles;
	DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
	DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
	unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
304
// global rasterizer state: framebuffer, vertex array bindings, the texture
// table, the worker threads and the command pool
// NOTE: the struct body is passed through the ALIGN() macro, so member
// declarations must stay free of top-level commas (one declarator each)
typedef ALIGN(struct DPSOFTRAST_State_s
{
	int fb_width;
	int fb_height;
	unsigned int *fb_depthpixels;
	unsigned int *fb_colorpixels[4];

	int viewport[4];
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	float color[4];
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	const float *pointer_vertex3f;
	const float *pointer_color4f;
	const unsigned char *pointer_color4ub;
	const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int stride_vertex;
	int stride_color;
	int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

	int firstvertex;
	int numvertices;
	float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
	float *screencoord4f;
	int drawstarty;
	int drawendy;
	int drawclipped;
	
	int shader_mode;
	int shader_permutation;
	int shader_exactspecularmath;

	// texture table: texture_max is the allocated slot count, slots at or
	// beyond texture_end are unused, and the search for a free slot starts
	// at texture_firstfree
	int texture_max;
	int texture_end;
	int texture_firstfree;
	DPSOFTRAST_Texture *texture;

	int bigendian;

	// error reporting
	const char *errorstring;

	bool usethreads;
	int interlace;
	int numthreads;
	DPSOFTRAST_State_Thread *threads;

	ATOMIC(volatile int drawcommand);

	DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer instance
DPSOFTRAST_State dpsoftrast;
364
// depth buffer scale (2^30) and offset
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (each scaled by 255 and rounded) into one int
// laid out as alpha<<24 | red<<16 | green<<8 | blue; arguments are
// parenthesized so arbitrary expressions can be passed, and the shifts are
// done on unsigned values because shifting 255 into the top byte overflows a
// signed int; the result is converted back to int to keep the macro's type
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// converts a float depth d to the integer depth buffer encoding
// DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         for (;;)
621         {
622                 s = w * h * d * sides * 4;
623                 texture->mipmap[mipmaps][0] = size;
624                 texture->mipmap[mipmaps][1] = s;
625                 texture->mipmap[mipmaps][2] = w;
626                 texture->mipmap[mipmaps][3] = h;
627                 texture->mipmap[mipmaps][4] = d;
628                 size += s;
629                 mipmaps++;
630                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
631                         break;
632                 if (w > 1) w >>= 1;
633                 if (h > 1) h >>= 1;
634                 if (d > 1) d >>= 1;
635         }
636         texture->mipmaps = mipmaps;
637         texture->size = size;
638
639         // allocate the pixels now
640         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
641
642         return texnum;
643 }
644 void DPSOFTRAST_Texture_Free(int index)
645 {
646         DPSOFTRAST_Texture *texture;
647         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
648         if (texture->binds)
649                 DPSOFTRAST_Flush();
650         if (texture->bytes)
651                 MM_FREE(texture->bytes);
652         texture->bytes = NULL;
653         memset(texture, 0, sizeof(*texture));
654         // adjust the free range and used range
655         if (dpsoftrast.texture_firstfree > index)
656                 dpsoftrast.texture_firstfree = index;
657         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
658                 dpsoftrast.texture_end--;
659 }
660 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
661 {
662         int i, x, y, z, w, layer0, layer1, row0, row1;
663         unsigned char *o, *i0, *i1, *i2, *i3;
664         DPSOFTRAST_Texture *texture;
665         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
666         if (texture->mipmaps <= 1)
667                 return;
668         for (i = 1;i < texture->mipmaps;i++)
669         {
670                 for (z = 0;z < texture->mipmap[i][4];z++)
671                 {
672                         layer0 = z*2;
673                         layer1 = z*2+1;
674                         if (layer1 >= texture->mipmap[i-1][4])
675                                 layer1 = texture->mipmap[i-1][4]-1;
676                         for (y = 0;y < texture->mipmap[i][3];y++)
677                         {
678                                 row0 = y*2;
679                                 row1 = y*2+1;
680                                 if (row1 >= texture->mipmap[i-1][3])
681                                         row1 = texture->mipmap[i-1][3]-1;
682                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
683                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
684                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
685                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
686                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
687                                 w = texture->mipmap[i][2];
688                                 if (layer1 > layer0)
689                                 {
690                                         if (texture->mipmap[i-1][2] > 1)
691                                         {
692                                                 // average 3D texture
693                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
694                                                 {
695                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
696                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
697                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
698                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
699                                                 }
700                                         }
701                                         else
702                                         {
703                                                 // average 3D mipmap with parent width == 1
704                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
705                                                 {
706                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
707                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
708                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
709                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
710                                                 }
711                                         }
712                                 }
713                                 else
714                                 {
715                                         if (texture->mipmap[i-1][2] > 1)
716                                         {
717                                                 // average 2D texture (common case)
718                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
719                                                 {
720                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
721                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
722                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
723                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
724                                                 }
725                                         }
726                                         else
727                                         {
728                                                 // 2D texture with parent width == 1
729                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
730                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
731                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
732                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
733                                         }
734                                 }
735                         }
736                 }
737         }
738 }
739 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
740 {
741         DPSOFTRAST_Texture *texture;
742         unsigned char *dst;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
744         if (texture->binds)
745                 DPSOFTRAST_Flush();
746         if (pixels)
747         {
748                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
749                 while (blockheight > 0)
750                 {
751                         dst -= texture->mipmap[0][2] * 4;
752                         memcpy(dst, pixels, blockwidth * 4);
753                         pixels += blockwidth * 4;
754                         blockheight--;
755                 }
756         }
757         DPSOFTRAST_Texture_CalculateMipmaps(index);
758 }
759 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
760 {
761         DPSOFTRAST_Texture *texture;
762         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
763         if (texture->binds)
764                 DPSOFTRAST_Flush();
765         if (pixels)
766         {
767                 int i, stride = texture->mipmap[0][2]*4;
768                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
769                 for (i = texture->mipmap[0][3];i > 0;i--)
770                 {
771                         dst -= stride;
772                         memcpy(dst, pixels, stride);
773                         pixels += stride;
774                 }
775         }
776         DPSOFTRAST_Texture_CalculateMipmaps(index);
777 }
778 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][2];
783 }
784 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][3];
789 }
790 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         return texture->mipmap[mip][4];
795 }
796 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
797 {
798         DPSOFTRAST_Texture *texture;
799         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
800         if (texture->binds)
801                 DPSOFTRAST_Flush();
802         return texture->bytes + texture->mipmap[mip][0];
803 }
804 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
805 {
806         DPSOFTRAST_Texture *texture;
807         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
808         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
809         {
810                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
811                 return;
812         }
813         if (texture->binds)
814                 DPSOFTRAST_Flush();
815         texture->filter = filter;
816 }
817
818 static void DPSOFTRAST_Draw_FlushThreads(void);
819
820 static void DPSOFTRAST_Draw_SyncCommands(void)
821 {
822         if(dpsoftrast.usethreads) MEMORY_BARRIER;
823         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
824 }
825
826 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
827 {
828         DPSOFTRAST_State_Thread *thread;
829         int i;
830         int freecommand = dpsoftrast.commandpool.freecommand;
831         int usedcommands = dpsoftrast.commandpool.usedcommands;
832         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
833                 return;
834         DPSOFTRAST_Draw_SyncCommands();
835         for(;;)
836         {
837                 int waitindex = -1;
838                 int commandoffset;
839                 usedcommands = 0;
840                 for (i = 0; i < dpsoftrast.numthreads; i++)
841                 {
842                         thread = &dpsoftrast.threads[i]; 
843                         commandoffset = freecommand - thread->commandoffset;
844                         if (commandoffset < 0)
845                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
846                         if (commandoffset > usedcommands)
847                         {
848                                 waitindex = i;
849                                 usedcommands = commandoffset;
850                         }
851                 }
852                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
853                         break;
854                 thread = &dpsoftrast.threads[waitindex];
855                 Thread_LockMutex(thread->drawmutex);
856                 if (thread->commandoffset != dpsoftrast.drawcommand)
857                 {
858                         thread->waiting = true;
859                         if (thread->starving) Thread_CondSignal(thread->drawcond);
860                         Thread_CondWait(thread->waitcond, thread->drawmutex);
861                         thread->waiting = false;
862                 }
863                 Thread_UnlockMutex(thread->drawmutex);
864         }
865         dpsoftrast.commandpool.usedcommands = usedcommands;
866 }
867
868 #define DPSOFTRAST_ALIGNCOMMAND(size) \
869         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
870 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
871         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
872
873 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
874 {
875         DPSOFTRAST_Command *command;
876         int freecommand = dpsoftrast.commandpool.freecommand;
877         int usedcommands = dpsoftrast.commandpool.usedcommands;
878         int extra = sizeof(DPSOFTRAST_Command);
879         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
880                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
881         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
882         {
883                 if (dpsoftrast.usethreads)
884                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
885                 else
886                         DPSOFTRAST_Draw_FlushThreads();
887                 freecommand = dpsoftrast.commandpool.freecommand;
888                 usedcommands = dpsoftrast.commandpool.usedcommands;
889         }
890         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
891         {
892                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893                 command->opcode = DPSOFTRAST_OPCODE_Reset;
894                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
895                 freecommand = 0;
896         }
897         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
898         command->opcode = opcode;
899         command->commandsize = size;
900         freecommand += size;
901         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
902                 freecommand = 0;
903         dpsoftrast.commandpool.freecommand = freecommand;
904         dpsoftrast.commandpool.usedcommands = usedcommands + size;
905         return command;
906 }
907
908 static void DPSOFTRAST_UndoCommand(int size)
909 {
910         int freecommand = dpsoftrast.commandpool.freecommand;
911         int usedcommands = dpsoftrast.commandpool.usedcommands;
912         freecommand -= size;
913         if (freecommand < 0)
914                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
915         usedcommands -= size;
916         dpsoftrast.commandpool.freecommand = freecommand;
917         dpsoftrast.commandpool.usedcommands = usedcommands;
918 }
919                 
920 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
921 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
922 {
923         thread->viewport[0] = command->x;
924         thread->viewport[1] = command->y;
925         thread->viewport[2] = command->width;
926         thread->viewport[3] = command->height;
927         thread->validate |= DPSOFTRAST_VALIDATE_FB;
928 }
929 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
930 {
931         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
932         command->x = x;
933         command->y = y;
934         command->width = width;
935         command->height = height;
936
937         dpsoftrast.viewport[0] = x;
938         dpsoftrast.viewport[1] = y;
939         dpsoftrast.viewport[2] = width;
940         dpsoftrast.viewport[3] = height;
941         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
942 }
943
944 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
945 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
946 {
947         int i, x1, y1, x2, y2, w, h, x, y;
948         int miny1, maxy1, miny2, maxy2;
949         int bandy;
950         unsigned int *p;
951         unsigned int c;
952         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953         miny1 = thread->miny1;
954         maxy1 = thread->maxy1;
955         miny2 = thread->miny2;
956         maxy2 = thread->maxy2;
957         x1 = thread->fb_scissor[0];
958         y1 = thread->fb_scissor[1];
959         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
961         if (y1 < miny1) y1 = miny1;
962         if (y2 > maxy2) y2 = maxy2;
963         w = x2 - x1;
964         h = y2 - y1;
965         if (w < 1 || h < 1)
966                 return;
967         // FIXME: honor fb_colormask?
968         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
969         for (i = 0;i < 4;i++)
970         {
971                 if (!dpsoftrast.fb_colorpixels[i])
972                         continue;
973                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
974                 for (;y < bandy;y++)
975                 {
976                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
977                         for (x = x1;x < x2;x++)
978                                 p[x] = c;
979                 }
980         }
981 }
982 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
983 {
984         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
985         command->r = r;
986         command->g = g;
987         command->b = b;
988         command->a = a;
989 }
990
991 DEFCOMMAND(3, ClearDepth, float depth;)
992 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
993 {
994         int x1, y1, x2, y2, w, h, x, y;
995         int miny1, maxy1, miny2, maxy2;
996         int bandy;
997         unsigned int *p;
998         unsigned int c;
999         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1000         miny1 = thread->miny1;
1001         maxy1 = thread->maxy1;
1002         miny2 = thread->miny2;
1003         maxy2 = thread->maxy2;
1004         x1 = thread->fb_scissor[0];
1005         y1 = thread->fb_scissor[1];
1006         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1007         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1008         if (y1 < miny1) y1 = miny1;
1009         if (y2 > maxy2) y2 = maxy2;
1010         w = x2 - x1;
1011         h = y2 - y1;
1012         if (w < 1 || h < 1)
1013                 return;
1014         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1015         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1016         for (;y < bandy;y++)
1017         {
1018                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1019                 for (x = x1;x < x2;x++)
1020                         p[x] = c;
1021         }
1022 }
1023 void DPSOFTRAST_ClearDepth(float d)
1024 {
1025         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1026         command->depth = d;
1027 }
1028
1029 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1030 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1031 {
1032         thread->colormask[0] = command->r != 0;
1033         thread->colormask[1] = command->g != 0;
1034         thread->colormask[2] = command->b != 0;
1035         thread->colormask[3] = command->a != 0;
1036         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1037 }
1038 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1039 {
1040         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1041         command->r = r;
1042         command->g = g;
1043         command->b = b;
1044         command->a = a;
1045 }
1046
1047 DEFCOMMAND(5, DepthTest, int enable;)
1048 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1049 {
1050         thread->depthtest = command->enable;
1051         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1052 }
1053 void DPSOFTRAST_DepthTest(int enable)
1054 {
1055         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1056         command->enable = enable;
1057 }
1058
1059 DEFCOMMAND(6, ScissorTest, int enable;)
1060 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1061 {
1062         thread->scissortest = command->enable;
1063         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1064 }
1065 void DPSOFTRAST_ScissorTest(int enable)
1066 {
1067         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1068         command->enable = enable;
1069 }
1070
1071 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1072 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1073 {
1074         thread->scissor[0] = command->x;
1075         thread->scissor[1] = command->y;
1076         thread->scissor[2] = command->width;
1077         thread->scissor[3] = command->height;
1078         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1079 }
1080 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1081 {
1082         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1083         command->x = x;
1084         command->y = y;
1085         command->width = width;
1086         command->height = height;
1087 }
1088
1089 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1090 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1091 {
1092         thread->blendfunc[0] = command->sfactor;
1093         thread->blendfunc[1] = command->dfactor;
1094         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1095 }
1096 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1097 {
1098         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1099         command->sfactor = sfactor;
1100         command->dfactor = dfactor;
1101 }
1102
1103 DEFCOMMAND(9, BlendSubtract, int enable;)
1104 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1105 {
1106         thread->blendsubtract = command->enable;
1107         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1108 }
1109 void DPSOFTRAST_BlendSubtract(int enable)
1110 {
1111         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1112         command->enable = enable;
1113 }
1114
1115 DEFCOMMAND(10, DepthMask, int enable;)
1116 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1117 {
1118         thread->depthmask = command->enable;
1119 }
1120 void DPSOFTRAST_DepthMask(int enable)
1121 {
1122         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1123         command->enable = enable;
1124 }
1125
1126 DEFCOMMAND(11, DepthFunc, int func;)
1127 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1128 {
1129         thread->depthfunc = command->func;
1130 }
1131 void DPSOFTRAST_DepthFunc(int func)
1132 {
1133         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1134         command->func = func;
1135 }
1136
1137 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1138 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1139 {
1140         thread->depthrange[0] = command->nearval;
1141         thread->depthrange[1] = command->farval;
1142 }
1143 void DPSOFTRAST_DepthRange(float nearval, float farval)
1144 {
1145         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1146         command->nearval = nearval;
1147         command->farval = farval;
1148 }
1149
1150 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1151 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1152 {
1153         thread->polygonoffset[0] = command->alongnormal;
1154         thread->polygonoffset[1] = command->intoview;
1155 }
1156 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1157 {
1158         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1159         command->alongnormal = alongnormal;
1160         command->intoview = intoview;
1161 }
1162
1163 DEFCOMMAND(14, CullFace, int mode;)
1164 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1165 {
1166         thread->cullface = command->mode;
1167 }
1168 void DPSOFTRAST_CullFace(int mode)
1169 {
1170         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1171         command->mode = mode;
1172 }
1173
1174 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1175 {
1176         dpsoftrast.color[0] = r;
1177         dpsoftrast.color[1] = g;
1178         dpsoftrast.color[2] = b;
1179         dpsoftrast.color[3] = a;
1180 }
1181
1182 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1183 {
1184         int outstride = blockwidth * 4;
1185         int instride = dpsoftrast.fb_width * 4;
1186         int bx1 = blockx;
1187         int by1 = blocky;
1188         int bx2 = blockx + blockwidth;
1189         int by2 = blocky + blockheight;
1190         int bw;
1191         int x;
1192         int y;
1193         unsigned char *inpixels;
1194         unsigned char *b;
1195         unsigned char *o;
1196         DPSOFTRAST_Flush();
1197         if (bx1 < 0) bx1 = 0;
1198         if (by1 < 0) by1 = 0;
1199         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1200         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1201         bw = bx2 - bx1;
1202         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1203         if (dpsoftrast.bigendian)
1204         {
1205                 for (y = by1;y < by2;y++)
1206                 {
1207                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1208                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1209                         for (x = bx1;x < bx2;x++)
1210                         {
1211                                 o[0] = b[3];
1212                                 o[1] = b[2];
1213                                 o[2] = b[1];
1214                                 o[3] = b[0];
1215                                 o += 4;
1216                                 b += 4;
1217                         }
1218                 }
1219         }
1220         else
1221         {
1222                 for (y = by1;y < by2;y++)
1223                 {
1224                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1225                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1226                         memcpy(o, b, bw*4);
1227                 }
1228         }
1229
1230 }
1231 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1232 {
1233         int tx1 = tx;
1234         int ty1 = ty;
1235         int tx2 = tx + width;
1236         int ty2 = ty + height;
1237         int sx1 = sx;
1238         int sy1 = sy;
1239         int sx2 = sx + width;
1240         int sy2 = sy + height;
1241         int swidth;
1242         int sheight;
1243         int twidth;
1244         int theight;
1245         int sw;
1246         int sh;
1247         int tw;
1248         int th;
1249         int y;
1250         unsigned int *spixels;
1251         unsigned int *tpixels;
1252         DPSOFTRAST_Texture *texture;
1253         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1254         if (mip < 0 || mip >= texture->mipmaps) return;
1255         DPSOFTRAST_Flush();
1256         spixels = dpsoftrast.fb_colorpixels[0];
1257         swidth = dpsoftrast.fb_width;
1258         sheight = dpsoftrast.fb_height;
1259         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1260         twidth = texture->mipmap[mip][2];
1261         theight = texture->mipmap[mip][3];
1262         if (tx1 < 0) tx1 = 0;
1263         if (ty1 < 0) ty1 = 0;
1264         if (tx2 > twidth) tx2 = twidth;
1265         if (ty2 > theight) ty2 = theight;
1266         if (sx1 < 0) sx1 = 0;
1267         if (sy1 < 0) sy1 = 0;
1268         if (sx2 > swidth) sx2 = swidth;
1269         if (sy2 > sheight) sy2 = sheight;
1270         tw = tx2 - tx1;
1271         th = ty2 - ty1;
1272         sw = sx2 - sx1;
1273         sh = sy2 - sy1;
1274         if (tw > sw) tw = sw;
1275         if (th > sh) th = sh;
1276         if (tw < 1 || th < 1)
1277                 return;
1278         sy1 = sheight - sy1 - th;
1279         ty1 = theight - ty1 - th;
1280         for (y = 0;y < th;y++)
1281                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1282         if (texture->mipmaps > 1)
1283                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1284 }
1285
1286 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1287 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1288 {
1289         if (thread->texbound[command->unitnum])
1290                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1291         thread->texbound[command->unitnum] = command->texture;
1292 }
1293 void DPSOFTRAST_SetTexture(int unitnum, int index)
1294 {
1295         DPSOFTRAST_Command_SetTexture *command;
1296         DPSOFTRAST_Texture *texture;
1297         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1298         {
1299                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1300                 return;
1301         }
1302         texture = DPSOFTRAST_Texture_GetByIndex(index);
1303         if (index && !texture)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1306                 return;
1307         }
1308
1309         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1310         command->unitnum = unitnum;
1311         command->texture = texture;
1312
1313         dpsoftrast.texbound[unitnum] = texture;
1314         if (texture)
1315                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1316 }
1317
1318 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1319 {
1320         dpsoftrast.pointer_vertex3f = vertex3f;
1321         dpsoftrast.stride_vertex = (int)stride;
1322 }
1323 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1324 {
1325         dpsoftrast.pointer_color4f = color4f;
1326         dpsoftrast.pointer_color4ub = NULL;
1327         dpsoftrast.stride_color = (int)stride;
1328 }
1329 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1330 {
1331         dpsoftrast.pointer_color4f = NULL;
1332         dpsoftrast.pointer_color4ub = color4ub;
1333         dpsoftrast.stride_color = (int)stride;
1334 }
1335 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1336 {
1337         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1338         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1339         dpsoftrast.stride_texcoord[unitnum] = (int)stride;
1340 }
1341
1342 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1343 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1344 {
1345         thread->shader_mode = command->mode;
1346         thread->shader_permutation = command->permutation;
1347         thread->shader_exactspecularmath = command->exactspecularmath;
1348 }
1349 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1350 {
1351         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1352         command->mode = mode;
1353         command->permutation = permutation;
1354         command->exactspecularmath = exactspecularmath;
1355
1356         dpsoftrast.shader_mode = mode;
1357         dpsoftrast.shader_permutation = permutation;
1358         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1359 }
1360
1361 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1362 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1363 {
1364         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1365 }
1366 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1367 {
1368         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1369         command->index = index;
1370         command->val[0] = v0;
1371         command->val[1] = v1;
1372         command->val[2] = v2;
1373         command->val[3] = v3;
1374
1375         dpsoftrast.uniform4f[index*4+0] = v0;
1376         dpsoftrast.uniform4f[index*4+1] = v1;
1377         dpsoftrast.uniform4f[index*4+2] = v2;
1378         dpsoftrast.uniform4f[index*4+3] = v3;
1379 }
1380 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1381 {
1382         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383         command->index = index;
1384         memcpy(command->val, v, sizeof(command->val));
1385
1386         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1387 }
1388
1389 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1390 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1391 {
1392         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1393 }
1394 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1395 {
1396 #ifdef SSE_POSSIBLE
1397         int i, index;
1398         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1399         {
1400                 __m128 m0, m1, m2, m3;
1401                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1402                 command->index = (DPSOFTRAST_UNIFORM)index;
1403                 if (((size_t)v)&(ALIGN_SIZE-1))
1404                 {
1405                         m0 = _mm_loadu_ps(v);
1406                         m1 = _mm_loadu_ps(v+4);
1407                         m2 = _mm_loadu_ps(v+8);
1408                         m3 = _mm_loadu_ps(v+12);
1409                 }
1410                 else
1411                 {
1412                         m0 = _mm_load_ps(v);
1413                         m1 = _mm_load_ps(v+4);
1414                         m2 = _mm_load_ps(v+8);
1415                         m3 = _mm_load_ps(v+12);
1416                 }
1417                 if (transpose)
1418                 {
1419                         __m128 t0, t1, t2, t3;
1420                         t0 = _mm_unpacklo_ps(m0, m1);
1421                         t1 = _mm_unpacklo_ps(m2, m3);
1422                         t2 = _mm_unpackhi_ps(m0, m1);
1423                         t3 = _mm_unpackhi_ps(m2, m3);
1424                         m0 = _mm_movelh_ps(t0, t1);
1425                         m1 = _mm_movehl_ps(t1, t0);
1426                         m2 = _mm_movelh_ps(t2, t3);
1427                         m3 = _mm_movehl_ps(t3, t2);                     
1428                 }
1429                 _mm_store_ps(command->val, m0);
1430                 _mm_store_ps(command->val+4, m1);
1431                 _mm_store_ps(command->val+8, m2);
1432                 _mm_store_ps(command->val+12, m3);
1433                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1437         }
1438 #endif
1439 }
1440
1441 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1442 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1443 {
1444         thread->uniform1i[command->index] = command->val;
1445 }
1446 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1447 {
1448         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1449         command->index = index;
1450         command->val = i0;
1451
1452         dpsoftrast.uniform1i[command->index] = i0;
1453 }
1454
1455 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1456 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1457 {
1458         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1459         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1460 }
1461 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1462 {
1463         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1464         command->clipplane[0] = x;
1465         command->clipplane[1] = y;
1466         command->clipplane[2] = z;
1467         command->clipplane[3] = w;
1468 }
1469
1470 #ifdef SSE_POSSIBLE
// Copies `size` four-float vectors from src to dst.  Consecutive source
// vectors are `stride` bytes apart; destination vectors are packed and are
// written with aligned 16-byte stores.
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	// aligned loads are only usable when both the start address and the
	// step between vectors are multiples of ALIGN_SIZE
	if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
	{
		for (; dst < stop; dst += 4, src += stride)
			_mm_store_ps(dst, _mm_loadu_ps((const float *)src));
	}
	else
	{
		for (; dst < stop; dst += 4, src += stride)
			_mm_store_ps(dst, _mm_load_ps((const float *)src));
	}
}
1493
// Expands `size` three-float vectors read from src into four-float vectors
// (x, y, z, 1.0f) written to dst.  Consecutive source vectors are `stride`
// bytes apart; destination vectors are packed and are written with aligned
// 16-byte stores.
//
// Only the 12 bytes belonging to each source vector are read in the
// remainder loop, so the last vector never causes a read past the end of the
// caller's array.
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 one = _mm_set_ss(1.0f);
	if (stride == sizeof(float[3]))
	{
		// tightly packed input: four vectors are exactly three 16-byte
		// loads, so whole groups of four are converted at once
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
				__m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
				// each vector is rotated so lane 0 is free, lane 0 is set
				// to 1.0f, then the lanes are rotated back to x y z 1
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, one);
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
		else
		{
			while (dst < end4)
			{
				// same conversion as above using aligned loads
				__m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, one);
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
	}
	// Remaining vectors (all of them when the input is not tightly packed).
	// A full 16-byte load here would touch 4 bytes beyond the vector, which
	// for the final vector lies outside the source array, so x/y are fetched
	// with an 8-byte load and z with a 4-byte load.  Neither load has an
	// alignment requirement.
	while (dst < end)
	{
		__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)src);        // x y 0 0
		__m128 z1 = _mm_unpacklo_ps(_mm_load_ss((const float *)src + 2), one); // z 1 0 0
		_mm_store_ps(dst, _mm_movelh_ps(xy, z1));                              // x y z 1
		dst += 4;
		src += stride;
	}
}
1570
// Expands `size` two-float vectors read from src into four-float vectors
// (x, y, 0.0f, 1.0f) written to dst.  Consecutive source vectors are `stride`
// bytes apart; destination vectors are packed and are written with aligned
// 16-byte stores.
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	// supplies the constant upper half (0, 1) of every output vector
	const __m128 tail = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	if (stride == sizeof(float[2]))
	{
		// tightly packed input: one 16-byte load holds two source vectors
		float * const pairstop = dst + (size&~1)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // source not 16-byte aligned
		{
			for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
			{
				const __m128 pair = _mm_loadu_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, tail, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(tail, pair));
			}
		}
		else
		{
			for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
			{
				const __m128 pair = _mm_load_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, tail, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(tail, pair));
			}
		}
	}
	// remaining vectors, one 8-byte load each
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(tail, (const __m64 *)src));
}
1608
// Converts `size` four-byte vectors read from src into four-float vectors
// written to dst, multiplying every byte by 1/255.  Consecutive source
// vectors are `stride` bytes apart; destination vectors are packed and are
// written with aligned 16-byte stores.
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 scale = _mm_set1_ps(1.0f/255.0f);
	const __m128i zero = _mm_setzero_si128();
	if (stride == sizeof(unsigned char[4]))
	{
		// tightly packed input: one 16-byte load holds four source vectors
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				// widen bytes to 16 bit, then to 32 bit, then convert
				__m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, zero), v2 = _mm_unpackhi_epi8(v, zero);
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, zero)), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, zero)), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, zero), v2 = _mm_unpackhi_epi8(v, zero);
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, zero)), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, zero)), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
	}
	// remaining vectors (all of them when the input is not tightly packed)
	while (dst < end)
	{
		// src is a byte pointer with an arbitrary stride, so the four
		// bytes are fetched with memcpy rather than through an int pointer
		// (which would be a misaligned, aliasing-violating access)
		int packed;
		__m128i v;
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, zero), zero)), scale));
		dst += 4;
		src += stride;
	}
}
1651
// Writes `size` copies of the four floats at src to dst.  src is read once
// with an unaligned load; dst is written with aligned 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1662 #endif
1663
// Transforms `numitems` four-float vectors from in4f to out4f by the 16-float
// matrix inmatrix16f: each output is x*row0 + y*row1 + z*row2 + w*row3, where
// rowN is the N-th group of four matrix floats.  An identity matrix is
// detected by bytewise comparison and handled as a plain copy (skipped when
// out4f and in4f are the same pointer).  Outputs are written with aligned
// 16-byte stores.  Without SSE_POSSIBLE the function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix16f[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	if (memcmp(identitymatrix16f, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // input not 16-byte aligned
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 vert = _mm_loadu_ps(in4f);
			// summed innermost-first: z*row2 + w*row3, then y*row1, then x*row0
			const __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(2, 2, 2, 2)), row2),
			                             _mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(3, 3, 3, 3)), row3));
			const __m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(1, 1, 1, 1)), row1), zw);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(0, 0, 0, 0)), row0), yzw));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 vert = _mm_load_ps(in4f);
			const __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(2, 2, 2, 2)), row2),
			                             _mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(3, 3, 3, 3)), row3));
			const __m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(1, 1, 1, 1)), row1), zw);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(0, 0, 0, 0)), row0), yzw));
		}
	}
#endif
}
1711
#if 0
// Compiled out.  Copies `numitems` four-float vectors from in4f to out4f.
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
#endif
1718
1719 #ifdef SSE_POSSIBLE
// Projects the vector `in` = (x, y, z, w) and stores the result in `out`:
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN / sN are lanes of viewportcenter / viewportscale.  Internally the
// lanes are rotated to (1, x, y, z) so lane 0 of center/scale applies to the
// constant 1 and lanes 1..3 apply to x, y, z; the result is rotated back.
// The expansion is a brace block that declares locals named p and w, so the
// arguments must not refer to caller variables with those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_div_ps(_mm_mul_ps(viewportscale, p), w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1727
// Stores in `out` the vector `in` = (x, y, z, w) multiplied by the matrix
// whose four rows are m0..m3:  out = x*m0 + (y*m1 + (z*m2 + w*m3)).
// The expansion is a brace block that declares a local named p, so the
// arguments must not refer to a caller variable with that name.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1736
1737 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1738 {
1739         int clipmask = 0xFF;
1740         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1741         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1742         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1743         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1744         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1745         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1746         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1747         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1748         #define BBFRONT(k, pos) \
1749         { \
1750                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1751                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1752                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1753                 { \
1754                         __m128 proj; \
1755                         clipmask &= ~(1<<k); \
1756                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1757                         minproj = _mm_min_ss(minproj, proj); \
1758                         maxproj = _mm_max_ss(maxproj, proj); \
1759                 } \
1760         }
1761         BBFRONT(0, minpos); 
1762         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1763         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1764         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1765         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1766         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1767         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1768         BBFRONT(7, maxpos);
1769         #define BBCLIP(k) \
1770         { \
1771                 if (clipmask&(1<<k)) \
1772                 { \
1773                         if (!(clipmask&(1<<(k^1)))) \
1774                         { \
1775                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1776                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1777                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1778                                 minproj = _mm_min_ss(minproj, proj); \
1779                                 maxproj = _mm_max_ss(maxproj, proj); \
1780                         } \
1781                         if (!(clipmask&(1<<(k^2)))) \
1782                         { \
1783                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1784                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1785                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786                                 minproj = _mm_min_ss(minproj, proj); \
1787                                 maxproj = _mm_max_ss(maxproj, proj); \
1788                         } \
1789                         if (!(clipmask&(1<<(k^4)))) \
1790                         { \
1791                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1792                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1793                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794                                 minproj = _mm_min_ss(minproj, proj); \
1795                                 maxproj = _mm_max_ss(maxproj, proj); \
1796                         } \
1797                 } \
1798         }
1799         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1800         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1801         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1802         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1803         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1804         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1805         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1806         *starty = _mm_cvttss_si32(maxproj);
1807         *endy = _mm_cvttss_si32(minproj)+1;
1808         return clipmask;
1809 }
1810         
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1812 {
1813         static const float identitymatrix16f[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814         float *end = out4f + numitems*4;
1815         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816         __m128 minpos, maxpos;
1817         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1818         {
1819                 minpos = maxpos = _mm_loadu_ps(in4f);
1820                 while (out4f < end)
1821                 {
1822                         __m128 v = _mm_loadu_ps(in4f);
1823                         minpos = _mm_min_ps(minpos, v);
1824                         maxpos = _mm_max_ps(maxpos, v);
1825                         _mm_store_ps(out4f, v);
1826                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827                         _mm_store_ps(screen4f, v);
1828                         in4f += 4;
1829                         out4f += 4;
1830                         screen4f += 4;
1831                 }
1832         }
1833         else
1834         {
1835                 minpos = maxpos = _mm_load_ps(in4f);
1836                 while (out4f < end)
1837                 {
1838                         __m128 v = _mm_load_ps(in4f);
1839                         minpos = _mm_min_ps(minpos, v);
1840                         maxpos = _mm_max_ps(maxpos, v);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850         {
1851                 ALIGN(float minposf[4]);
1852                 ALIGN(float maxposf[4]);
1853                 _mm_store_ps(minposf, minpos);
1854                 _mm_store_ps(maxposf, maxpos);
1855                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix16f);
1856         }
1857         return 0;
1858 }
1859
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1861 {
1862         static const float identitymatrix16f[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1864         float *end;
1865         if (!memcmp(identitymatrix16f, inmatrix16f, sizeof(float[16])))
1866                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867         end = out4f + numitems*4;
1868         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870         m0 = _mm_loadu_ps(inmatrix16f);
1871         m1 = _mm_loadu_ps(inmatrix16f + 4);
1872         m2 = _mm_loadu_ps(inmatrix16f + 8);
1873         m3 = _mm_loadu_ps(inmatrix16f + 12);
1874         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1875         {
1876                 minpos = maxpos = _mm_loadu_ps(in4f);
1877                 while (out4f < end)
1878                 {
1879                         __m128 v = _mm_loadu_ps(in4f);
1880                         minpos = _mm_min_ps(minpos, v);
1881                         maxpos = _mm_max_ps(maxpos, v);
1882                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883                         _mm_store_ps(out4f, v);
1884                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885                         _mm_store_ps(screen4f, v);
1886                         in4f += 4;
1887                         out4f += 4;
1888                         screen4f += 4;
1889                 }
1890         }
1891         else
1892         {
1893                 minpos = maxpos = _mm_load_ps(in4f);
1894                 while (out4f < end)
1895                 {
1896                         __m128 v = _mm_load_ps(in4f);
1897                         minpos = _mm_min_ps(minpos, v);
1898                         maxpos = _mm_max_ps(maxpos, v);
1899                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900                         _mm_store_ps(out4f, v);
1901                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902                         _mm_store_ps(screen4f, v);
1903                         in4f += 4;
1904                         out4f += 4;
1905                         screen4f += 4;
1906                 }
1907         }
1908         if (starty && endy) 
1909         {
1910                 ALIGN(float minposf[4]);
1911                 ALIGN(float maxposf[4]);
1912                 _mm_store_ps(minposf, minpos);
1913                 _mm_store_ps(maxposf, maxpos);
1914                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1915         }
1916         return 0;
1917 }
1918 #endif
1919
// Loads the vertex attribute selected by `inarray` for the current draw range
// (dpsoftrast.firstvertex / dpsoftrast.numvertices) into
// dpsoftrast.post_array4f[outarray] as four-float vectors and returns that
// buffer.
//   DPSOFTRAST_ARRAY_POSITION: expanded from the three-float vertex pointer.
//   DPSOFTRAST_ARRAY_COLOR:    from the float color pointer if set, else from
//                              the byte color pointer if set, else filled
//                              with dpsoftrast.color.
//   anything else:             treated as texcoord unit
//                              inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and loaded
//                              according to its component count (2, 3 or 4).
// NOTE(review): when the texcoord pointer is NULL or the component count is
// not 2, 3 or 4, nothing is written and the buffer keeps its previous
// contents - confirm that callers expect this.
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1978
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1980 {
1981         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1983         return data;
1984 }
1985
#if 0
// Compiled out.  Projects an attribute array in place with
// DPSOFTRAST_Vertex_Project, writing screen coordinates to
// dpsoftrast.screencoord4f and storing the returned value in
// dpsoftrast.drawclipped.  With inarray >= 0 the attribute is loaded first;
// otherwise the current contents of the post array are used.
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1998
// Transforms an attribute array in place by inmatrix16f and projects it with
// DPSOFTRAST_Vertex_TransformProject: screen coordinates go to
// dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty /
// dpsoftrast.drawendy, and the returned value to dpsoftrast.drawclipped.
// With inarray >= 0 the attribute is loaded first; otherwise the current
// contents of the post array are used.  Returns the transformed array, or
// NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2009
2010 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2011 {
2012         int x;
2013         int startx = span->startx;
2014         int endx = span->endx;
2015         float wslope = triangle->w[0];
2016         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2017         float endz = 1.0f / (w + wslope * startx);
2018         if (triangle->w[0] == 0)
2019         {
2020                 // LordHavoc: fast flat polygons (HUD/menu)
2021                 for (x = startx;x < endx;x++)
2022                         zf[x] = endz;
2023                 return;
2024         }
2025         for (x = startx;x < endx;)
2026         {
2027                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2028                 float z = endz, dz;
2029                 if (nextsub >= endx) nextsub = endsub = endx-1;
2030                 endz = 1.0f / (w + wslope * nextsub);
2031                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032                 for (; x <= endsub; x++, z += dz)
2033                         zf[x] = z;
2034         }
2035 }
2036
2037 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2038 {
2039 #ifdef SSE_POSSIBLE
2040         int x;
2041         int startx = span->startx;
2042         int endx = span->endx;
2043         int maskx;
2044         int subx;
2045         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046         unsigned char * RESTRICT pixelmask = span->pixelmask;
2047         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2048         if (!pixeli)
2049                 return;
2050         pixeli += span->y * dpsoftrast.fb_width + span->x;
2051         // handle alphatest now (this affects depth writes too)
2052         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2053                 for (x = startx;x < endx;x++)
2054                         if (in4ub[x*4+3] < 128)
2055                                 pixelmask[x] = false;
2056         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2057         // helps sprites, text and hud artwork
2058         switch(thread->fb_blendmode)
2059         {
2060         case DPSOFTRAST_BLENDMODE_ALPHA:
2061         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2062         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2063                 maskx = startx;
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (in4ub[x*4+3] >= 1)
2067                         {
2068                                 startx = x;
2069                                 for (;;)
2070                                 {
2071                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2072                                         maskx = x;
2073                                         if (x >= endx) break;
2074                                         ++x;
2075                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2076                                         if (x >= endx) break;
2077                                 }
2078                                 break;
2079                         }
2080                 }
2081                 endx = maskx;
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_OPAQUE:
2084         case DPSOFTRAST_BLENDMODE_ADD:
2085         case DPSOFTRAST_BLENDMODE_INVMOD:
2086         case DPSOFTRAST_BLENDMODE_MUL:
2087         case DPSOFTRAST_BLENDMODE_MUL2:
2088         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2089         case DPSOFTRAST_BLENDMODE_INVADD:
2090                 break;
2091         }
2092         // put some special values at the end of the mask to ensure the loops end
2093         pixelmask[endx] = 1;
2094         pixelmask[endx+1] = 0;
2095         // LordHavoc: use a double loop to identify subspans, this helps the
2096         // optimized copy/blend loops to perform at their best, most triangles
2097         // have only one run of pixels, and do the search using wide reads...
2098         x = startx;
2099         while (x < endx)
2100         {
2101                 // if this pixel is masked off, it's probably not alone...
2102                 if (!pixelmask[x])
2103                 {
2104                         x++;
2105 #if 1
2106                         if (x + 8 < endx)
2107                         {
2108                                 // the 4-item search must be aligned or else it stalls badly
2109                                 if ((x & 3) && !pixelmask[x]) 
2110                                 {
2111                                         if(pixelmask[x]) goto endmasked;
2112                                         x++;
2113                                         if (x & 3)
2114                                         {
2115                                                 if(pixelmask[x]) goto endmasked;
2116                                                 x++;
2117                                                 if (x & 3)
2118                                                 {
2119                                                         if(pixelmask[x]) goto endmasked;
2120                                                         x++;
2121                                                 }
2122                                         }
2123                                 }
2124                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2125                                         x += 4;
2126                         }
2127 #endif
2128                         for (;!pixelmask[x];x++)
2129                                 ;
2130                         // rather than continue the loop, just check the end variable
2131                         if (x >= endx)
2132                                 break;
2133                 }
2134         endmasked:
2135                 // find length of subspan
2136                 subx = x + 1;
2137 #if 1
2138                 if (subx + 8 < endx)
2139                 {
2140                         if (subx & 3)
2141                         {
2142                                 if(!pixelmask[subx]) goto endunmasked;
2143                                 subx++;
2144                                 if (subx & 3)
2145                                 {
2146                                         if(!pixelmask[subx]) goto endunmasked;
2147                                         subx++;
2148                                         if (subx & 3)
2149                                         {
2150                                                 if(!pixelmask[subx]) goto endunmasked;
2151                                                 subx++;
2152                                         }
2153                                 }
2154                         }
2155                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2156                                 subx += 4;
2157                 }
2158 #endif
2159                 for (;pixelmask[subx];subx++)
2160                         ;
2161                 // the checks can overshoot, so make sure to clip it...
2162                 if (subx > endx)
2163                         subx = endx;
2164         endunmasked:
2165                 // now that we know the subspan length...  process!
2166                 switch(thread->fb_blendmode)
2167                 {
2168                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2169 #if 0
2170                         if (subx - x >= 16)
2171                         {
2172                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2173                                 x = subx;
2174                         }
2175                         else
2176 #elif 1
2177                         while (x + 16 <= subx)
2178                         {
2179                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2180                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2182                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2183                                 x += 16;
2184                         }
2185 #endif
2186                         {
2187                                 while (x + 4 <= subx)
2188                                 {
2189                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2190                                         x += 4;
2191                                 }
2192                                 if (x + 2 <= subx)
2193                                 {
2194                                         pixeli[x] = ini[x];
2195                                         pixeli[x+1] = ini[x+1];
2196                                         x += 2;
2197                                 }
2198                                 if (x < subx)
2199                                 {
2200                                         pixeli[x] = ini[x];
2201                                         x++;
2202                                 }
2203                         }
2204                         break;
2205                 case DPSOFTRAST_BLENDMODE_ALPHA:
2206                 #define FINISHBLEND(blend2, blend1) \
2207                         for (;x + 1 < subx;x += 2) \
2208                         { \
2209                                 __m128i src, dst; \
2210                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2211                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2212                                 blend2; \
2213                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2214                         } \
2215                         if (x < subx) \
2216                         { \
2217                                 __m128i src, dst; \
2218                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2220                                 blend1; \
2221                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222                                 x++; \
2223                         }
2224                         FINISHBLEND({
2225                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2227                         }, {
2228                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                         });
2231                         break;
2232                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2233                         FINISHBLEND({
2234                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                         }, {
2237                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                         });
2240                         break;
2241                 case DPSOFTRAST_BLENDMODE_ADD:
2242                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2243                         break;
2244                 case DPSOFTRAST_BLENDMODE_INVMOD:
2245                         FINISHBLEND({
2246                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2247                         }, {
2248                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                         });
2250                         break;
2251                 case DPSOFTRAST_BLENDMODE_MUL:
2252                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_MUL2:
2255                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2258                         FINISHBLEND({
2259                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                         }, {
2262                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                         });
2265                         break;
2266                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2267                         FINISHBLEND({
2268                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2270                         }, {
2271                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                         });
2274                         break;
2275                 case DPSOFTRAST_BLENDMODE_INVADD:
2276                         FINISHBLEND({
2277                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2278                         }, {
2279                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                         });
2281                         break;
2282                 }
2283         }
2284 #endif
2285 }
2286
2287 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2288         // warning: this is SLOW, only use if the optimized per-span functions won't do
2289 {
2290         const unsigned char * RESTRICT pixelbase;
2291         const unsigned char * RESTRICT pixel[4];
2292         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2293         int wrapmask[2] = { width-1, height-1 };
2294         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2295         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2296         {
2297                 unsigned int tc[2] = { (unsigned int)floor(x) * (width<<12) - 2048, (unsigned int)floor(y) * (height<<12) - 2048};
2298                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2299                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2300                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2301                 int tci[2] = { (int)tc[0]>>12, (int)tc[1]>>12 };
2302                 int tci1[2] = { (int)tci[0] + 1, (int)tci[1] + 1 };
2303                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2304                 {
2305                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2306                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2307                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2308                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2309                 }
2310                 else
2311                 {
2312                         tci[0] &= wrapmask[0];
2313                         tci[1] &= wrapmask[1];
2314                         tci1[0] &= wrapmask[0];
2315                         tci1[1] &= wrapmask[1];
2316                 }
2317                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2318                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2319                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2320                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2321                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2322                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2323                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2324                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2325         }
2326         else
2327         {
2328                 int tci[2] = { (int)floor(x) * width, (int)floor(y) * height };
2329                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2330                 {
2331                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2332                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2333                 }
2334                 else
2335                 {
2336                         tci[0] &= wrapmask[0];
2337                         tci[1] &= wrapmask[1];
2338                 }
2339                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2340                 c[0] = pixel[0][0];
2341                 c[1] = pixel[0][1];
2342                 c[2] = pixel[0][2];
2343                 c[3] = pixel[0][3];
2344         }
2345 }
2346
#if 0
// (disabled) Samples the texture bound to texunitindex for every pixel of the
// span and writes the result as floats to out4f (4 per pixel; the first and
// third stored bytes are swapped on output, each byte scaled by 1/255).
// Texture coordinates come from vertex attribute arrayindex and are made
// perspective correct with zf at subspan boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels), then stepped linearly in 20.12 fixed
// point in between.
static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
{
        const int first = span->startx;
        const int last = span->endx;
        int px;
        int texflags;
        int linear;
        int level;
        int rowstride;
        int wrap[2];
        unsigned int lo[2];
        unsigned int hi[2];
        unsigned int t0[2];
        unsigned int t1[2];
        float data[4];
        float slope[4];
        float scale[2];
        float cur[2], next[2];
        float color[4];
        const unsigned char * RESTRICT base;
        const unsigned char * RESTRICT tap[4];
        DPSOFTRAST_Texture *tex = thread->texbound[texunitindex];

        // if no texture is bound, just fill it with white
        if (!tex)
        {
                for (px = first; px < last; px++)
                {
                        out4f[px*4+0] = 1.0f;
                        out4f[px*4+1] = 1.0f;
                        out4f[px*4+2] = 1.0f;
                        out4f[px*4+3] = 1.0f;
                }
                return;
        }

        level = triangle->mip[texunitindex];
        base = (unsigned char *)tex->bytes + tex->mipmap[level][0] + tex->mipmap[level][1] - 4*tex->mipmap[level][2];

        // if this mipmap of the texture is 1 pixel, just fill it with that color
        if (tex->mipmap[level][1] == 4)
        {
                color[0] = tex->bytes[2] * (1.0f/255.0f);
                color[1] = tex->bytes[1] * (1.0f/255.0f);
                color[2] = tex->bytes[0] * (1.0f/255.0f);
                color[3] = tex->bytes[3] * (1.0f/255.0f);
                for (px = first; px < last; px++)
                {
                        out4f[px*4+0] = color[0];
                        out4f[px*4+1] = color[1];
                        out4f[px*4+2] = color[2];
                        out4f[px*4+3] = color[3];
                }
                return;
        }

        linear = tex->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
        DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
        texflags = tex->flags;
        scale[0] = tex->mipmap[level][2];
        scale[1] = tex->mipmap[level][3];
        // rows are addressed with a negative stride from base
        rowstride = -tex->mipmap[level][2];
        lo[0] = 0;
        lo[1] = 0;
        hi[0] = tex->mipmap[level][2]-1;
        hi[1] = tex->mipmap[level][3]-1;
        wrap[0] = tex->mipmap[level][2]-1;
        wrap[1] = tex->mipmap[level][3]-1;

        // texel-space coordinate at the start of the span
        next[0] = (data[0] + slope[0]*first) * zf[first] * scale[0];
        next[1] = (data[1] + slope[1]*first) * zf[first] * scale[1];
        if (linear)
        {
                // bilinear taps are centered half a texel earlier
                next[0] -= 0.5f;
                next[1] -= 0.5f;
        }

        px = first;
        while (px < last)
        {
                unsigned int fixed[2];
                unsigned int step[2];
                float perpixel = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
                int anchor = px + DPSOFTRAST_DRAW_MAXSUBSPAN;
                int stop = anchor - 1;

                // the final subspan ends exactly on the last pixel of the span
                if (anchor >= last)
                {
                        anchor = last - 1;
                        stop = anchor;
                        if (px < anchor)
                                perpixel = 4096.0f / (anchor - px);
                }

                cur[0] = next[0];
                cur[1] = next[1];
                next[0] = (data[0] + slope[0]*anchor) * zf[anchor] * scale[0];
                next[1] = (data[1] + slope[1]*anchor) * zf[anchor] * scale[1];
                if (linear)
                {
                        next[0] -= 0.5f;
                        next[1] -= 0.5f;
                }
                step[0] = (next[0] - cur[0]) * perpixel;
                step[1] = (next[1] - cur[1]) * perpixel;
                fixed[0] = cur[0] * (1<<12);
                fixed[1] = cur[1] * (1<<12);

                if (linear && (texflags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE))
                {
                        // bilinear, clamped to the edge texels
                        while (px <= stop)
                        {
                                unsigned int fx = fixed[0]&0xFFF;
                                unsigned int fy = fixed[1]&0xFFF;
                                unsigned int ifx = 0x1000 - fx;
                                unsigned int ify = 0x1000 - fy;
                                unsigned int wgt[4];
                                wgt[0] = ifx*ify;
                                wgt[1] = fx*ify;
                                wgt[2] = ifx*fy;
                                wgt[3] = fx*fy;
                                t0[0] = fixed[0]>>12;
                                t0[1] = fixed[1]>>12;
                                t1[0] = t0[0] + 1;
                                t1[1] = t0[1] + 1;
                                if (t0[0] < lo[0]) t0[0] = lo[0]; else if (t0[0] > hi[0]) t0[0] = hi[0];
                                if (t0[1] < lo[1]) t0[1] = lo[1]; else if (t0[1] > hi[1]) t0[1] = hi[1];
                                if (t1[0] < lo[0]) t1[0] = lo[0]; else if (t1[0] > hi[0]) t1[0] = hi[0];
                                if (t1[1] < lo[1]) t1[1] = lo[1]; else if (t1[1] > hi[1]) t1[1] = hi[1];
                                tap[0] = base + 4 * (t0[1]*rowstride+t0[0]);
                                tap[1] = base + 4 * (t0[1]*rowstride+t1[0]);
                                tap[2] = base + 4 * (t1[1]*rowstride+t0[0]);
                                tap[3] = base + 4 * (t1[1]*rowstride+t1[0]);
                                out4f[px*4+0] = (tap[0][2]*wgt[0]+tap[1][2]*wgt[1]+tap[2][2]*wgt[2]+tap[3][2]*wgt[3]) * (1.0f / 0xFF000000);
                                out4f[px*4+1] = (tap[0][1]*wgt[0]+tap[1][1]*wgt[1]+tap[2][1]*wgt[2]+tap[3][1]*wgt[3]) * (1.0f / 0xFF000000);
                                out4f[px*4+2] = (tap[0][0]*wgt[0]+tap[1][0]*wgt[1]+tap[2][0]*wgt[2]+tap[3][0]*wgt[3]) * (1.0f / 0xFF000000);
                                out4f[px*4+3] = (tap[0][3]*wgt[0]+tap[1][3]*wgt[1]+tap[2][3]*wgt[2]+tap[3][3]*wgt[3]) * (1.0f / 0xFF000000);
                                px++;
                                fixed[0] += step[0];
                                fixed[1] += step[1];
                        }
                }
                else if (linear)
                {
                        // bilinear, wrapped by masking
                        while (px <= stop)
                        {
                                unsigned int fx = fixed[0]&0xFFF;
                                unsigned int fy = fixed[1]&0xFFF;
                                unsigned int ifx = 0x1000 - fx;
                                unsigned int ify = 0x1000 - fy;
                                unsigned int wgt[4];
                                wgt[0] = ifx*ify;
                                wgt[1] = fx*ify;
                                wgt[2] = ifx*fy;
                                wgt[3] = fx*fy;
                                t0[0] = fixed[0]>>12;
                                t0[1] = fixed[1]>>12;
                                t1[0] = t0[0] + 1;
                                t1[1] = t0[1] + 1;
                                t0[0] &= wrap[0];
                                t0[1] &= wrap[1];
                                t1[0] &= wrap[0];
                                t1[1] &= wrap[1];
                                tap[0] = base + 4 * (t0[1]*rowstride+t0[0]);
                                tap[1] = base + 4 * (t0[1]*rowstride+t1[0]);
                                tap[2] = base + 4 * (t1[1]*rowstride+t0[0]);
                                tap[3] = base + 4 * (t1[1]*rowstride+t1[0]);
                                out4f[px*4+0] = (tap[0][2]*wgt[0]+tap[1][2]*wgt[1]+tap[2][2]*wgt[2]+tap[3][2]*wgt[3]) * (1.0f / 0xFF000000);
                                out4f[px*4+1] = (tap[0][1]*wgt[0]+tap[1][1]*wgt[1]+tap[2][1]*wgt[2]+tap[3][1]*wgt[3]) * (1.0f / 0xFF000000);
                                out4f[px*4+2] = (tap[0][0]*wgt[0]+tap[1][0]*wgt[1]+tap[2][0]*wgt[2]+tap[3][0]*wgt[3]) * (1.0f / 0xFF000000);
                                out4f[px*4+3] = (tap[0][3]*wgt[0]+tap[1][3]*wgt[1]+tap[2][3]*wgt[2]+tap[3][3]*wgt[3]) * (1.0f / 0xFF000000);
                                px++;
                                fixed[0] += step[0];
                                fixed[1] += step[1];
                        }
                }
                else if (texflags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
                {
                        // nearest, clamped to the edge texels
                        while (px <= stop)
                        {
                                t0[0] = fixed[0]>>12;
                                t0[1] = fixed[1]>>12;
                                if (t0[0] < lo[0]) t0[0] = lo[0]; else if (t0[0] > hi[0]) t0[0] = hi[0];
                                if (t0[1] < lo[1]) t0[1] = lo[1]; else if (t0[1] > hi[1]) t0[1] = hi[1];
                                tap[0] = base + 4 * (t0[1]*rowstride+t0[0]);
                                out4f[px*4+0] = tap[0][2] * (1.0f / 255.0f);
                                out4f[px*4+1] = tap[0][1] * (1.0f / 255.0f);
                                out4f[px*4+2] = tap[0][0] * (1.0f / 255.0f);
                                out4f[px*4+3] = tap[0][3] * (1.0f / 255.0f);
                                px++;
                                fixed[0] += step[0];
                                fixed[1] += step[1];
                        }
                }
                else
                {
                        // nearest, wrapped by masking
                        while (px <= stop)
                        {
                                t0[0] = fixed[0]>>12;
                                t0[1] = fixed[1]>>12;
                                t0[0] &= wrap[0];
                                t0[1] &= wrap[1];
                                tap[0] = base + 4 * (t0[1]*rowstride+t0[0]);
                                out4f[px*4+0] = tap[0][2] * (1.0f / 255.0f);
                                out4f[px*4+1] = tap[0][1] * (1.0f / 255.0f);
                                out4f[px*4+2] = tap[0][0] * (1.0f / 255.0f);
                                out4f[px*4+3] = tap[0][3] * (1.0f / 255.0f);
                                px++;
                                fixed[0] += step[0];
                                fixed[1] += step[1];
                        }
                }
        }
}
#endif
2545
2546 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2547 {
2548 #ifdef SSE_POSSIBLE
2549         int x;
2550         int startx = span->startx;
2551         int endx = span->endx;
2552         int flags;
2553         __m128 data, slope, tcscale;
2554         __m128i tcsize, tcmask, tcoffset, tcmax;
2555         __m128 tc, endtc;
2556         __m128i subtc, substep, endsubtc;
2557         int filter;
2558         int mip;
2559         int affine; // LordHavoc: optimized affine texturing case
2560         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2561         const unsigned char * RESTRICT pixelbase;
2562         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2563         // if no texture is bound, just fill it with white
2564         if (!texture)
2565         {
2566                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2567                 return;
2568         }
2569         mip = triangle->mip[texunitindex];
2570         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2571         // if this mipmap of the texture is 1 pixel, just fill it with that color
2572         if (texture->mipmap[mip][1] == 4)
2573         {
2574                 unsigned int k = *((const unsigned int *)pixelbase);
2575                 for (x = startx;x < endx;x++)
2576                         outi[x] = k;
2577                 return;
2578         }
2579         affine = zf[startx] == zf[endx-1];
2580         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2581         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2582         flags = texture->flags;
2583         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2584         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2585         tcscale = _mm_cvtepi32_ps(tcsize);
2586         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2587         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2588         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2589         if (filter)
2590                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2591         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2592         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2593         tcmax = _mm_packs_epi32(tcmask, tcmask);
2594         for (x = startx;x < endx;)
2595         {
2596                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2597                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2598                 if (nextsub >= endx || affine)
2599                 {
2600                         nextsub = endsub = endx-1;
2601                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2602                 }       
2603                 tc = endtc;
2604                 subtc = endsubtc;
2605                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2606                 if (filter)
2607                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2608                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2609                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2610                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2611                 substep = _mm_slli_epi32(substep, 1);
2612                 if (filter)
2613                 {
2614                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2615                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2616                         {
2617                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2618                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2619                                 {
2620                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2621                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2622                                         tci = _mm_madd_epi16(tci, tcoffset);
2623                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2624                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2625                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2626                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2627                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2628                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2629                                         fracm = _mm_srli_epi16(subtc, 1);
2630                                         pix1 = _mm_add_epi16(pix1,
2631                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633                                         pix3 = _mm_add_epi16(pix3,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638                                         pix2 = _mm_add_epi16(pix2,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2642                                 }
2643                                 if (x <= endsub)
2644                                 {
2645                                         const unsigned char * RESTRICT ptr1;
2646                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2647                                         tci = _mm_madd_epi16(tci, tcoffset);
2648                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2650                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2651                                         fracm = _mm_srli_epi16(subtc, 1);
2652                                         pix1 = _mm_add_epi16(pix1,
2653                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2654                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2655                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2656                                         pix1 = _mm_add_epi16(pix1,
2657                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2659                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2660                                         x++;
2661                                 }
2662                         }
2663                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2664                         {
2665                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2666                                 {
2667                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2668                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2669                                         tci = _mm_madd_epi16(tci, tcoffset);
2670                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2671                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2672                                                                                         _mm_setzero_si128());
2673                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2674                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2675                                                                                         _mm_setzero_si128());
2676                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2677                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2678                                         tci = _mm_madd_epi16(tci, tcoffset);
2679                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2680                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681                                                                                         _mm_setzero_si128());
2682                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         fracm = _mm_srli_epi16(subtc, 1);
2686                                         pix1 = _mm_add_epi16(pix1,
2687                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689                                         pix3 = _mm_add_epi16(pix3,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2691                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2692                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2693                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2694                                         pix2 = _mm_add_epi16(pix2,
2695                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2696                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2697                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2698                                 }
2699                                 if (x <= endsub)
2700                                 {
2701                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2702                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2703                                         tci = _mm_madd_epi16(tci, tcoffset);
2704                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2705                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2706                                                                                         _mm_setzero_si128());
2707                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2708                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2709                                                                                         _mm_setzero_si128());
2710                                         fracm = _mm_srli_epi16(subtc, 1);
2711                                         pix1 = _mm_add_epi16(pix1,
2712                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2715                                         pix1 = _mm_add_epi16(pix1,
2716                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2717                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2718                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2719                                         x++;
2720                                 }
2721                         }
2722                         else
2723                         {
2724                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2725                                 {
2726                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2727                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2728                                         tci = _mm_madd_epi16(tci, tcoffset);
2729                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2730                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2731                                                                                         _mm_setzero_si128());
2732                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2733                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2734                                                                                         _mm_setzero_si128());
2735                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2736                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2737                                         tci = _mm_madd_epi16(tci, tcoffset);
2738                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740                                                                                         _mm_setzero_si128());
2741                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743                                                                                         _mm_setzero_si128());
2744                                         fracm = _mm_srli_epi16(subtc, 1);
2745                                         pix1 = _mm_add_epi16(pix1,
2746                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2747                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2748                                         pix3 = _mm_add_epi16(pix3,
2749                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2750                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2751                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2752                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2753                                         pix2 = _mm_add_epi16(pix2,
2754                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2755                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2756                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2757                                 }
2758                                 if (x <= endsub)
2759                                 {
2760                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2761                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762                                         tci = _mm_madd_epi16(tci, tcoffset);
2763                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2764                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765                                                                                         _mm_setzero_si128());
2766                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768                                                                                         _mm_setzero_si128());
2769                                         fracm = _mm_srli_epi16(subtc, 1);
2770                                         pix1 = _mm_add_epi16(pix1,
2771                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2774                                         pix1 = _mm_add_epi16(pix1,
2775                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2776                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2777                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2778                                         x++;
2779                                 }
2780                         }
2781                 }
2782                 else
2783                 {
2784                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2785                         {
2786                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2787                                 {
2788                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2790                                         tci = _mm_madd_epi16(tci, tcoffset);
2791                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2793                                 }
2794                                 if (x <= endsub)
2795                                 {
2796                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800                                         x++;
2801                                 }
2802                         }
2803                         else
2804                         {
2805                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2806                                 {
2807                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2808                                         tci = _mm_and_si128(tci, tcmax); 
2809                                         tci = _mm_madd_epi16(tci, tcoffset);
2810                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2812                                 }
2813                                 if (x <= endsub)
2814                                 {
2815                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2816                                         tci = _mm_and_si128(tci, tcmax); 
2817                                         tci = _mm_madd_epi16(tci, tcoffset);
2818                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2819                                         x++;
2820                                 }
2821                         }
2822                 }
2823         }
2824 #endif
2825 }
2826
2827 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2828 {
2829         // TODO: IMPLEMENT
2830         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2831 }
2832
// Shadowmap lookup placeholder: sampling is not implemented, so the result
// is always 1.0f no matter which vector is supplied.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector; // not consulted until real sampling exists
        return result;
}
2838
2839 #if 0
2840 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2841 {
2842         int x;
2843         int startx = span->startx;
2844         int endx = span->endx;
2845         float c[4];
2846         float data[4];
2847         float slope[4];
2848         float z;
2849         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2850         for (x = startx;x < endx;x++)
2851         {
2852                 z = zf[x];
2853                 c[0] = (data[0] + slope[0]*x) * z;
2854                 c[1] = (data[1] + slope[1]*x) * z;
2855                 c[2] = (data[2] + slope[2]*x) * z;
2856                 c[3] = (data[3] + slope[3]*x) * z;
2857                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2858                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2859                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2860                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2861         }
2862 }
2863 #endif
2864
2865 #if 0
2866 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2867 {
2868         int x;
2869         int startx = span->startx;
2870         int endx = span->endx;
2871         float c[4];
2872         float data[4];
2873         float slope[4];
2874         float z;
2875         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2876         for (x = startx;x < endx;x++)
2877         {
2878                 z = zf[x];
2879                 c[0] = (data[0] + slope[0]*x) * z;
2880                 c[1] = (data[1] + slope[1]*x) * z;
2881                 c[2] = (data[2] + slope[2]*x) * z;
2882                 c[3] = (data[3] + slope[3]*x) * z;
2883                 out4f[x*4+0] = c[0];
2884                 out4f[x*4+1] = c[1];
2885                 out4f[x*4+2] = c[2];
2886                 out4f[x*4+3] = c[3];
2887         }
2888 }
2889 #endif
2890
2891 #if 0
2892 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2893 {
2894         int x, startx = span->startx, endx = span->endx;
2895         float c[4], localcolor[4];
2896         localcolor[0] = subcolor[0];
2897         localcolor[1] = subcolor[1];
2898         localcolor[2] = subcolor[2];
2899         localcolor[3] = subcolor[3];
2900         for (x = startx;x < endx;x++)
2901         {
2902                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2903                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2904                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2905                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2906                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2907                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2908                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2909                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2910         }
2911 }
2912 #endif
2913
2914 #if 0
2915 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2916 {
2917         int x, startx = span->startx, endx = span->endx;
2918         for (x = startx;x < endx;x++)
2919         {
2920                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2921                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2922                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2923                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2924         }
2925 }
2926 #endif
2927
2928 #if 0
2929 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2930 {
2931         int x, startx = span->startx, endx = span->endx;
2932         for (x = startx;x < endx;x++)
2933         {
2934                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2935                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2936                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2937                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2938         }
2939 }
2940 #endif
2941
2942 #if 0
2943 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2944 {
2945         int x, startx = span->startx, endx = span->endx;
2946         float a, b;
2947         for (x = startx;x < endx;x++)
2948         {
2949                 a = 1.0f - inb4f[x*4+3];
2950                 b = inb4f[x*4+3];
2951                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2952                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2953                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2954                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2955         }
2956 }
2957 #endif
2958
2959 #if 0
2960 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2961 {
2962         int x, startx = span->startx, endx = span->endx;
2963         float localcolor[4], ilerp, lerp;
2964         localcolor[0] = color[0];
2965         localcolor[1] = color[1];