/*
 * dpsoftrast.c
 * Source: xonotic/darkplaces.git (de.git.xonotic.org gitweb blob view).
 * Commit note: "Squashed commit of the following:"
 */
1
2 #include <stdio.h>
3 #include <string.h>
4 #define _USE_MATH_DEFINES
5 #include <math.h>
6 #include "quakedef.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
13 #if defined(__GNUC__)
14 #define ALIGN(var) var __attribute__((__aligned__(16)))
15 #elif defined(_MSC_VER)
16 #define ALIGN(var) __declspec(align(16)) var
17 #else
18 #define ALIGN(var) var
19 #endif
20
21 #ifdef SSE2_PRESENT
22 #include <emmintrin.h>
23
24 #define MM_MALLOC(size) _mm_malloc(size, 16)
25
// Allocate a zero-filled array of nmemb elements of size bytes each using
// _mm_malloc with an alignment argument of 16.
// Returns NULL when the total byte count would not fit in size_t or when the
// allocation itself fails. Release the result with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // refuse requests whose byte total would wrap around size_t, which
        // would otherwise yield a block smaller than the caller expects
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, 16);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}
32
33 #define MM_FREE _mm_free
34 #else
35 #define MM_MALLOC(size) malloc(size)
36 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
37 #define MM_FREE free
38 #endif
39
// Slot numbers for the vertex arrays held in DPSOFTRAST_State_Draw
// (in_array4f / post_array4f). DPSOFTRAST_ARRAY_TOTAL is the slot count.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION  = 0,
        DPSOFTRAST_ARRAY_COLOR     = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
55
56 typedef struct DPSOFTRAST_Texture_s
57 {
58         int flags;
59         int width;
60         int height;
61         int depth;
62         int sides;
63         DPSOFTRAST_TEXTURE_FILTER filter;
64         int mipmaps;
65         int size;
66         unsigned char *bytes;
67         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
68 }
69 DPSOFTRAST_Texture;
70
// State exactly as set by the caller through the DPSOFTRAST_* setter
// functions; derived values are kept in DPSOFTRAST_State.
typedef struct DPSOFTRAST_State_User_s
{
        // per-channel write enables (r, g, b, a), each 0 or 1
        int colormask[4];
        // source and destination blend factors
        int blendfunc[2];
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int cullface;
        int alphatest;
        int alphafunc;
        float alphavalue;
        // x, y, width, height
        int scissor[4];
        // x, y, width, height
        int viewport[4];
        float depthrange[2];
        float polygonoffset[2];
        // r, g, b, a
        float color[4];
}
DPSOFTRAST_State_User;
91
92 #define DPSOFTRAST_MAXSUBSPAN 16
93
94 typedef ALIGN(struct DPSOFTRAST_State_Draw_Span_s
95 {
96         int start; // pixel index
97         int length; // pixel count
98         int startx; // usable range (according to pixelmask)
99         int endx; // usable range (according to pixelmask)
100         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
101         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
102         // [0][n][] is start interpolant values (projected)
103         // [1][n][] is end interpolant values (projected)
104         // [0][DPSOFTRAST_ARRAY_TOTAL][] is start screencoord4f
105         // [1][DPSOFTRAST_ARRAY_TOTAL][] is end screencoord4f
106         // NOTE: screencoord4f[3] is W (basically 1/Z), useful for depthbuffer
107         ALIGN(float data[2][DPSOFTRAST_ARRAY_TOTAL+1][4]);
108 }
109 DPSOFTRAST_State_Draw_Span);
110
111 #define DPSOFTRAST_DRAW_MAXSPANQUEUE 1024
112
113 typedef struct DPSOFTRAST_State_Draw_s
114 {
115         int numvertices;
116         int maxvertices;
117         float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
118         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
119         float *screencoord4f;
120
121         // spans are queued in this structure for dispatch to the pixel shader,
122         // partly to improve cache locality, partly for batching purposes, spans
123         // are flushed before DrawTriangles returns to caller
124         int numspans;
125         DPSOFTRAST_State_Draw_Span spanqueue[DPSOFTRAST_DRAW_MAXSPANQUEUE];
126 }
127 DPSOFTRAST_State_Draw;
128
129 #define DPSOFTRAST_VALIDATE_FB 1
130 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
131 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
132 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
133
// Blend modes stored in dpsoftrast.fb_blendmode; they are selected from the
// user blend function through blendmodetable.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
        DPSOFTRAST_BLENDMODE_ALPHA       = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
        DPSOFTRAST_BLENDMODE_ADD         = 3,
        DPSOFTRAST_BLENDMODE_INVMOD      = 4,
        DPSOFTRAST_BLENDMODE_MUL         = 5,
        DPSOFTRAST_BLENDMODE_MUL2        = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_TOTAL       = 9
}
DPSOFTRAST_BLENDMODE;
148
149 typedef ALIGN(struct DPSOFTRAST_State_s
150 {
151         // DPSOFTRAST_VALIDATE_ flags
152         int validate;
153
154         int fb_colormask;
155         int fb_width;
156         int fb_height;
157         unsigned int *fb_depthpixels;
158         unsigned int *fb_colorpixels[4];
159
160         const float *pointer_vertex3f;
161         const float *pointer_color4f;
162         const unsigned char *pointer_color4ub;
163         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
164         int stride_vertex;
165         int stride_color;
166         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
167         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
168         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
169
170         int shader_mode;
171         int shader_permutation;
172         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
173         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
174
175         // derived values (DPSOFTRAST_VALIDATE_FB)
176         int fb_clearscissor[4];
177         int fb_viewport[4];
178         int fb_viewportscissor[4];
179         ALIGN(float fb_viewportcenter[4]);
180         ALIGN(float fb_viewportscale[4]);
181
182         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
183         int fb_depthfunc;
184
185         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
186         int fb_blendmode;
187
188         int texture_max;
189         int texture_end;
190         int texture_firstfree;
191         DPSOFTRAST_Texture *texture;
192
193         int bigendian;
194
195         // error reporting
196         const char *errorstring;
197
198         DPSOFTRAST_State_User user;
199
200         DPSOFTRAST_State_Draw draw;
201 }
202 DPSOFTRAST_State);
203
204 DPSOFTRAST_State dpsoftrast;
205
206 extern int dpsoftrast_test;
207
208 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
209 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
210 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
211 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
212 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
213
214 void DPSOFTRAST_RecalcFB(void)
215 {
216         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
217         // and viewport projection values
218         int x1, x2, x3, x4, x5, x6;
219         int y1, y2, y3, y4, y5, y6;
220         x1 = dpsoftrast.user.scissor[0];
221         x2 = dpsoftrast.user.scissor[0] + dpsoftrast.user.scissor[2];
222         x3 = dpsoftrast.user.viewport[0];
223         x4 = dpsoftrast.user.viewport[0] + dpsoftrast.user.viewport[2];
224         y1 = dpsoftrast.fb_height - dpsoftrast.user.scissor[1] - dpsoftrast.user.scissor[3];
225         y2 = dpsoftrast.fb_height - dpsoftrast.user.scissor[1];
226         y3 = dpsoftrast.fb_height - dpsoftrast.user.viewport[1] - dpsoftrast.user.viewport[3];
227         y4 = dpsoftrast.fb_height - dpsoftrast.user.viewport[1];
228         if (!dpsoftrast.user.scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
229         if (x1 < 0) x1 = 0;
230         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
231         if (x3 < 0) x1 = 0;
232         if (x4 > dpsoftrast.fb_width) x4 = dpsoftrast.fb_width;
233         if (y1 < 0) y1 = 0;
234         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
235         if (y3 < 0) y1 = 0;
236         if (y4 > dpsoftrast.fb_height) y4 = dpsoftrast.fb_height;
237         x5 = x1;if (x5 < x3) x5 = x3;
238         x6 = x2;if (x6 > x4) x4 = x4;
239         y5 = y1;if (y5 < y3) y5 = y3;
240         y6 = y2;if (y6 > y4) y6 = y4;
241         dpsoftrast.fb_clearscissor[0] = x1;
242         dpsoftrast.fb_clearscissor[1] = y1;
243         dpsoftrast.fb_clearscissor[2] = x2 - x1;
244         dpsoftrast.fb_clearscissor[3] = y2 - y1;
245         dpsoftrast.fb_viewport[0] = x3;
246         dpsoftrast.fb_viewport[1] = y3;
247         dpsoftrast.fb_viewport[2] = x4 - x3;
248         dpsoftrast.fb_viewport[3] = y4 - y3;
249         dpsoftrast.fb_viewportscissor[0] = x5;
250         dpsoftrast.fb_viewportscissor[1] = y5;
251         dpsoftrast.fb_viewportscissor[2] = x6 - x5;
252         dpsoftrast.fb_viewportscissor[3] = y6 - y5;
253         dpsoftrast.fb_viewportcenter[1] = dpsoftrast.user.viewport[0] + 0.5f * dpsoftrast.user.viewport[2] - 0.5f;
254         dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.user.viewport[1] - 0.5f * dpsoftrast.user.viewport[3] - 0.5f;
255         dpsoftrast.fb_viewportcenter[3] = 0.5f;
256         dpsoftrast.fb_viewportcenter[0] = 0.0f;
257         dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.user.viewport[2];
258         dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.user.viewport[3];
259         dpsoftrast.fb_viewportscale[3] = 0.5f;
260         dpsoftrast.fb_viewportscale[0] = 1.0f;
261 }
262
263 void DPSOFTRAST_RecalcDepthFunc(void)
264 {
265         dpsoftrast.fb_depthfunc = dpsoftrast.user.depthtest ? dpsoftrast.user.depthfunc : GL_ALWAYS;
266 }
267
268 int blendmodetable[][4] = 
269 {
270         {DPSOFTRAST_BLENDMODE_OPAQUE, GL_ONE, GL_ZERO, false},
271         {DPSOFTRAST_BLENDMODE_ALPHA, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, false},
272         {DPSOFTRAST_BLENDMODE_ADDALPHA, GL_SRC_ALPHA, GL_ONE, false},
273         {DPSOFTRAST_BLENDMODE_ADD, GL_ONE, GL_ONE, false},
274         {DPSOFTRAST_BLENDMODE_INVMOD, GL_ZERO, GL_ONE_MINUS_SRC_COLOR, false},
275         {DPSOFTRAST_BLENDMODE_MUL, GL_ZERO, GL_SRC_COLOR, false},
276         {DPSOFTRAST_BLENDMODE_MUL, GL_DST_COLOR, GL_ZERO, false},
277         {DPSOFTRAST_BLENDMODE_MUL2, GL_DST_COLOR, GL_SRC_COLOR, false},
278         {DPSOFTRAST_BLENDMODE_PSEUDOALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA, false},
279         {DPSOFTRAST_BLENDMODE_SUBALPHA, GL_SRC_COLOR, GL_ONE, true}
280 };
281
282 void DPSOFTRAST_RecalcBlendFunc(void)
283 {
284         int i;
285         dpsoftrast.fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE;
286         for (i = 0;i < (int)(sizeof(blendmodetable) / sizeof(blendmodetable[0]));i++)
287         {
288                 if (dpsoftrast.user.blendfunc[0] == blendmodetable[i][1] && dpsoftrast.user.blendfunc[1] == blendmodetable[i][2] && dpsoftrast.user.blendsubtract == blendmodetable[i][3])
289                 {
290                         dpsoftrast.fb_blendmode = blendmodetable[i][0];
291                         break;
292                 }
293         }
294 }
295
296 #define DPSOFTRAST_ValidateQuick(f) ((dpsoftrast.validate & (f)) ? (DPSOFTRAST_Validate(f), 0) : 0)
297
298 void DPSOFTRAST_Validate(int mask)
299 {
300         mask &= dpsoftrast.validate;
301         if (!mask)
302                 return;
303         if (mask & DPSOFTRAST_VALIDATE_FB)
304         {
305                 dpsoftrast.validate &= ~DPSOFTRAST_VALIDATE_FB;
306                 DPSOFTRAST_RecalcFB();
307         }
308         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
309         {
310                 dpsoftrast.validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
311                 DPSOFTRAST_RecalcDepthFunc();
312         }
313         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
314         {
315                 dpsoftrast.validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
316                 DPSOFTRAST_RecalcBlendFunc();
317         }
318 }
319
320 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
321 {
322         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
323                 return &dpsoftrast.texture[index];
324         return NULL;
325 }
326
327 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
328 {
329         int w;
330         int h;
331         int d;
332         int size;
333         int s;
334         int texnum;
335         int mipmaps;
336         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
337         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
338         DPSOFTRAST_Texture *texture;
339         if (width*height*depth < 1)
340         {
341                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
342                 return 0;
343         }
344         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
345         {
346                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
347                 return 0;
348         }
349         switch(texformat)
350         {
351         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
352         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
353         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
354                 break;
355         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
356                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
357                 {
358                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
359                         return 0;
360                 }
361                 if (depth != 1)
362                 {
363                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
364                         return 0;
365                 }
366                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
367                 {
368                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
369                         return 0;
370                 }
371                 break;
372         }
373         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
374         {
375                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
376                 return 0;
377         }
378         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
379         {
380                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
381                 return 0;
382         }
383         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
384         {
385                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
386                 return 0;
387         }
388         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
389         {
390                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
391                 return 0;
392         }
393         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
394         {
395                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
396                 return 0;
397         }
398         // find first empty slot in texture array
399         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
400                 if (!dpsoftrast.texture[texnum].bytes)
401                         break;
402         dpsoftrast.texture_firstfree = texnum + 1;
403         if (dpsoftrast.texture_max <= texnum)
404         {
405                 // expand texture array as needed
406                 if (dpsoftrast.texture_max < 1024)
407                         dpsoftrast.texture_max = 1024;
408                 else
409                         dpsoftrast.texture_max *= 2;
410                 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
411         }
412         if (dpsoftrast.texture_end <= texnum)
413                 dpsoftrast.texture_end = texnum + 1;
414         texture = &dpsoftrast.texture[texnum];
415         memset(texture, 0, sizeof(*texture));
416         texture->flags = flags;
417         texture->width = width;
418         texture->height = height;
419         texture->depth = depth;
420         texture->sides = sides;
421         w = width;
422         h = height;
423         d = depth;
424         size = 0;
425         mipmaps = 0;
426         w = width;
427         h = height;
428         d = depth;
429         for (;;)
430         {
431                 s = w * h * d * sides * 4;
432                 texture->mipmap[mipmaps][0] = size;
433                 texture->mipmap[mipmaps][1] = s;
434                 texture->mipmap[mipmaps][2] = w;
435                 texture->mipmap[mipmaps][3] = h;
436                 texture->mipmap[mipmaps][4] = d;
437                 size += s;
438                 mipmaps++;
439                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
440                         break;
441                 if (w > 1) w >>= 1;
442                 if (h > 1) h >>= 1;
443                 if (d > 1) d >>= 1;
444         }
445         texture->mipmaps = mipmaps;
446         texture->size = size;
447
448         // allocate the pixels now
449         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
450
451         return texnum;
452 }
453 void DPSOFTRAST_Texture_Free(int index)
454 {
455         DPSOFTRAST_Texture *texture;
456         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
457         if (texture->bytes)
458                 MM_FREE(texture->bytes);
459         texture->bytes = NULL;
460         memset(texture, 0, sizeof(*texture));
461         // adjust the free range and used range
462         if (dpsoftrast.texture_firstfree > index)
463                 dpsoftrast.texture_firstfree = index;
464         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
465                 dpsoftrast.texture_end--;
466 }
467 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
468 {
469         int i, x, y, z, w, layer0, layer1, row0, row1;
470         unsigned char *o, *i0, *i1, *i2, *i3;
471         DPSOFTRAST_Texture *texture;
472         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
473         if (texture->mipmaps <= 1)
474                 return;
475         for (i = 1;i < texture->mipmaps;i++)
476         {
477                 for (z = 0;z < texture->mipmap[i][4];z++)
478                 {
479                         layer0 = z*2;
480                         layer1 = z*2+1;
481                         if (layer1 >= texture->mipmap[i-1][4])
482                                 layer1 = texture->mipmap[i-1][4]-1;
483                         for (y = 0;y < texture->mipmap[i][3];y++)
484                         {
485                                 row0 = y*2;
486                                 row1 = y*2+1;
487                                 if (row1 >= texture->mipmap[i-1][3])
488                                         row1 = texture->mipmap[i-1][3]-1;
489                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
490                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
491                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
492                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
493                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
494                                 w = texture->mipmap[i][2];
495                                 if (layer1 > layer0)
496                                 {
497                                         if (texture->mipmap[i-1][2] > 1)
498                                         {
499                                                 // average 3D texture
500                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
501                                                 {
502                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
503                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
504                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
505                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
506                                                 }
507                                         }
508                                         else
509                                         {
510                                                 // average 3D mipmap with parent width == 1
511                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
512                                                 {
513                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
514                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
515                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
516                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
517                                                 }
518                                         }
519                                 }
520                                 else
521                                 {
522                                         if (texture->mipmap[i-1][2] > 1)
523                                         {
524                                                 // average 2D texture (common case)
525                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
526                                                 {
527                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
528                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
529                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
530                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
531                                                 }
532                                         }
533                                         else
534                                         {
535                                                 // 2D texture with parent width == 1
536                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
537                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
538                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
539                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
540                                         }
541                                 }
542                         }
543                 }
544         }
545 }
546 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
547 {
548         DPSOFTRAST_Texture *texture;
549         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
550
551         // FIXME IMPLEMENT
552
553         dpsoftrast.errorstring = "DPSOFTRAST_Texture_UpdatePartial: Not implemented.";
554 }
555 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
556 {
557         DPSOFTRAST_Texture *texture;
558         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
559
560         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
561         DPSOFTRAST_Texture_CalculateMipmaps(index);
562 }
563 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
564 {
565         DPSOFTRAST_Texture *texture;
566         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
567         return texture->mipmap[mip][2];
568 }
569 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
570 {
571         DPSOFTRAST_Texture *texture;
572         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
573         return texture->mipmap[mip][3];
574 }
575 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
576 {
577         DPSOFTRAST_Texture *texture;
578         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
579         return texture->mipmap[mip][4];
580 }
581 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
582 {
583         DPSOFTRAST_Texture *texture;
584         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
585         return texture->bytes + texture->mipmap[mip][0];
586 }
587 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
588 {
589         DPSOFTRAST_Texture *texture;
590         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
591         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
592         {
593                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
594                 return;
595         }
596         texture->filter = filter;
597 }
598
599 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
600 {
601         dpsoftrast.fb_width = width;
602         dpsoftrast.fb_height = height;
603         dpsoftrast.fb_depthpixels = depthpixels;
604         dpsoftrast.fb_colorpixels[0] = colorpixels0;
605         dpsoftrast.fb_colorpixels[1] = colorpixels1;
606         dpsoftrast.fb_colorpixels[2] = colorpixels2;
607         dpsoftrast.fb_colorpixels[3] = colorpixels3;
608 }
609 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
610 {
611         dpsoftrast.user.viewport[0] = x;
612         dpsoftrast.user.viewport[1] = y;
613         dpsoftrast.user.viewport[2] = width;
614         dpsoftrast.user.viewport[3] = height;
615         dpsoftrast.validate |= DPSOFTRAST_VALIDATE_FB;
616 }
617 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
618 {
619         int i, x1, y1, x2, y2, w, h, x, y;
620         unsigned int *p;
621         unsigned int c;
622         DPSOFTRAST_Validate(DPSOFTRAST_VALIDATE_FB);
623         x1 = dpsoftrast.fb_clearscissor[0];
624         y1 = dpsoftrast.fb_clearscissor[1];
625         x2 = dpsoftrast.fb_clearscissor[2];
626         y2 = dpsoftrast.fb_clearscissor[1] + dpsoftrast.fb_clearscissor[3];
627         w = x2 - x1;
628         h = y2 - y1;
629         if (w < 1 || h < 1)
630                 return;
631         // FIXME: honor dpsoftrast.fb_colormask?
632         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a);
633         for (i = 0;i < 4;i++)
634         {
635                 if (!dpsoftrast.fb_colorpixels[i])
636                         continue;
637                 for (y = y1;y < y2;y++)
638                 {
639                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
640                         for (x = x1;x < x2;x++)
641                                 p[x] = c;
642                 }
643         }
644 }
645 void DPSOFTRAST_ClearDepth(float d)
646 {
647         int x1, y1, x2, y2, w, h, x, y;
648         unsigned int *p;
649         unsigned int c;
650         DPSOFTRAST_Validate(DPSOFTRAST_VALIDATE_FB);
651         x1 = dpsoftrast.fb_clearscissor[0];
652         y1 = dpsoftrast.fb_clearscissor[1];
653         x2 = dpsoftrast.fb_clearscissor[2];
654         y2 = dpsoftrast.fb_clearscissor[1] + dpsoftrast.fb_clearscissor[3];
655         w = x2 - x1;
656         h = y2 - y1;
657         if (w < 1 || h < 1)
658                 return;
659         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d);
660         for (y = y1;y < y2;y++)
661         {
662                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
663                 for (x = x1;x < x2;x++)
664                         p[x] = c;
665         }
666 }
667 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
668 {
669         dpsoftrast.user.colormask[0] = r != 0;
670         dpsoftrast.user.colormask[1] = g != 0;
671         dpsoftrast.user.colormask[2] = b != 0;
672         dpsoftrast.user.colormask[3] = a != 0;
673         dpsoftrast.fb_colormask = ((-dpsoftrast.user.colormask[0]) & 0x00FF0000) | ((-dpsoftrast.user.colormask[1]) & 0x0000FF00) | ((-dpsoftrast.user.colormask[2]) & 0x000000FF) | ((-dpsoftrast.user.colormask[3]) & 0xFF000000);
674 }
675 void DPSOFTRAST_DepthTest(int enable)
676 {
677         dpsoftrast.user.depthtest = enable;
678         dpsoftrast.validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
679 }
680 void DPSOFTRAST_ScissorTest(int enable)
681 {
682         dpsoftrast.user.scissortest = enable;
683         dpsoftrast.validate |= DPSOFTRAST_VALIDATE_FB;
684 }
685 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
686 {
687         dpsoftrast.user.scissor[0] = x;
688         dpsoftrast.user.scissor[1] = y;
689         dpsoftrast.user.scissor[2] = width;
690         dpsoftrast.user.scissor[3] = height;
691         dpsoftrast.validate |= DPSOFTRAST_VALIDATE_FB;
692 }
693
694 void DPSOFTRAST_BlendFunc(int smodulate, int dmodulate)
695 {
696         // FIXME: validate
697         dpsoftrast.user.blendfunc[0] = smodulate;
698         dpsoftrast.user.blendfunc[1] = dmodulate;
699         dpsoftrast.validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
700 }
701 void DPSOFTRAST_BlendSubtract(int enable)
702 {
703         dpsoftrast.user.blendsubtract = enable != 0;
704         dpsoftrast.validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
705 }
706 void DPSOFTRAST_DepthMask(int enable)
707 {
708         dpsoftrast.user.depthmask = enable;
709 }
710 void DPSOFTRAST_DepthFunc(int comparemode)
711 {
712         // FIXME: validate
713         dpsoftrast.user.depthfunc = comparemode;
714 }
715 void DPSOFTRAST_DepthRange(float range0, float range1)
716 {
717         dpsoftrast.user.depthrange[0] = range0;
718         dpsoftrast.user.depthrange[1] = range1;
719 }
720 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
721 {
722         dpsoftrast.user.polygonoffset[0] = alongnormal;
723         dpsoftrast.user.polygonoffset[1] = intoview;
724 }
725 void DPSOFTRAST_CullFace(int mode)
726 {
727         // FIXME: validate
728         dpsoftrast.user.cullface = mode;
729 }
730 void DPSOFTRAST_AlphaTest(float enable)
731 {
732         dpsoftrast.user.alphatest = enable;
733 }
734 void DPSOFTRAST_AlphaFunc(int alphafunc, float alphavalue)
735 {
736         // FIXME: validate
737         dpsoftrast.user.alphafunc = alphafunc;
738         dpsoftrast.user.alphavalue = alphavalue;
739 }
740 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
741 {
742         dpsoftrast.user.color[0] = r;
743         dpsoftrast.user.color[1] = g;
744         dpsoftrast.user.color[2] = b;
745         dpsoftrast.user.color[3] = a;
746 }
747 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
748 {
749         int outstride = blockwidth * 4;
750         int instride = dpsoftrast.fb_width * 4;
751         int bx1 = blockx;
752         int by1 = blocky;
753         int bx2 = blockx + blockwidth;
754         int by2 = blocky + blockheight;
755         int bw;
756         int bh;
757         int x;
758         int y;
759         unsigned char *inpixels;
760         unsigned char *b;
761         unsigned char *o;
762         if (bx1 < 0) bx1 = 0;
763         if (by1 < 0) by1 = 0;
764         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
765         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
766         bw = bx2 - bx1;
767         bh = by2 - by1;
768         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
769         if (dpsoftrast.bigendian)
770         {
771                 for (y = by1;y < by2;y++)
772                 {
773                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
774                         o = (unsigned char *)outpixels + (y - by1) * outstride;
775                         for (x = bx1;x < bx2;x++)
776                         {
777                                 o[0] = b[3];
778                                 o[1] = b[2];
779                                 o[2] = b[1];
780                                 o[3] = b[0];
781                                 o += 4;
782                                 b += 4;
783                         }
784                 }
785         }
786         else
787         {
788                 for (y = by1;y < by2;y++)
789                 {
790                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
791                         o = (unsigned char *)outpixels + (y - by1) * outstride;
792                         memcpy(o, b, bw*4);
793                 }
794         }
795
796 }
797 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
798 {
799         int tx1 = tx;
800         int ty1 = ty;
801         int tx2 = tx + width;
802         int ty2 = ty + height;
803         int sx1 = sx;
804         int sy1 = sy;
805         int sx2 = sx + width;
806         int sy2 = sy + height;
807         int swidth;
808         int sheight;
809         int twidth;
810         int theight;
811         int sw;
812         int sh;
813         int tw;
814         int th;
815         int y;
816         unsigned int *spixels;
817         unsigned int *tpixels;
818         DPSOFTRAST_Texture *texture;
819         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
820         if (mip < 0 || mip >= texture->mipmaps) return;
821         spixels = dpsoftrast.fb_colorpixels[0];
822         swidth = dpsoftrast.fb_width;
823         sheight = dpsoftrast.fb_height;
824         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
825         twidth = texture->mipmap[mip][2];
826         theight = texture->mipmap[mip][3];
827         if (tx1 < 0) tx1 = 0;
828         if (ty1 < 0) ty1 = 0;
829         if (tx2 > twidth) tx2 = twidth;
830         if (ty2 > theight) ty2 = theight;
831         if (sx1 < 0) sx1 = 0;
832         if (sy1 < 0) sy1 = 0;
833         if (sx2 > swidth) sx2 = swidth;
834         if (sy2 > sheight) sy2 = sheight;
835         tw = tx2 - tx1;
836         th = ty2 - ty1;
837         sw = sx2 - sx1;
838         sh = sy2 - sy1;
839         if (tw > sw) tw = sw;
840         if (th > sh) th = sh;
841         if (tw < 1 || th < 1)
842                 return;
843         for (y = 0;y < th;y++)
844                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
845         if (texture->mipmaps > 1)
846                 DPSOFTRAST_Texture_CalculateMipmaps(index);
847 }
848 void DPSOFTRAST_SetTexture(int unitnum, int index)
849 {
850         DPSOFTRAST_Texture *texture;
851         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
852         {
853                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
854                 return;
855         }
856         texture = DPSOFTRAST_Texture_GetByIndex(index);
857         if (index && !texture)
858         {
859                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
860                 return;
861         }
862         dpsoftrast.texbound[unitnum] = texture;
863 }
864
865 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
866 {
867         dpsoftrast.pointer_vertex3f = vertex3f;
868         dpsoftrast.stride_vertex = stride;
869 }
870 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
871 {
872         dpsoftrast.pointer_color4f = color4f;
873         dpsoftrast.pointer_color4ub = NULL;
874         dpsoftrast.stride_color = stride;
875 }
876 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
877 {
878         dpsoftrast.pointer_color4f = NULL;
879         dpsoftrast.pointer_color4ub = color4ub;
880         dpsoftrast.stride_color = stride;
881 }
882 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
883 {
884         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
885         dpsoftrast.components_texcoord[unitnum] = numcomponents;
886         dpsoftrast.stride_texcoord[unitnum] = stride;
887 }
888
889 void DPSOFTRAST_SetShader(unsigned int mode, unsigned int permutation)
890 {
891         dpsoftrast.shader_mode = mode;
892         dpsoftrast.shader_permutation = permutation;
893 }
894 void DPSOFTRAST_Uniform4fARB(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
895 {
896         dpsoftrast.uniform4f[index*4+0] = v0;
897         dpsoftrast.uniform4f[index*4+1] = v1;
898         dpsoftrast.uniform4f[index*4+2] = v2;
899         dpsoftrast.uniform4f[index*4+3] = v3;
900 }
901 void DPSOFTRAST_Uniform4fvARB(DPSOFTRAST_UNIFORM index, const float *v)
902 {
903         dpsoftrast.uniform4f[index*4+0] = v[0];
904         dpsoftrast.uniform4f[index*4+1] = v[1];
905         dpsoftrast.uniform4f[index*4+2] = v[2];
906         dpsoftrast.uniform4f[index*4+3] = v[3];
907 }
908 void DPSOFTRAST_UniformMatrix4fvARB(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
909 {
910         int i, index;
911         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
912         {
913                 if (transpose)
914                 {
915                         dpsoftrast.uniform4f[index*4+0] = v[0];
916                         dpsoftrast.uniform4f[index*4+1] = v[4];
917                         dpsoftrast.uniform4f[index*4+2] = v[8];
918                         dpsoftrast.uniform4f[index*4+3] = v[12];
919                         dpsoftrast.uniform4f[index*4+4] = v[1];
920                         dpsoftrast.uniform4f[index*4+5] = v[5];
921                         dpsoftrast.uniform4f[index*4+6] = v[9];
922                         dpsoftrast.uniform4f[index*4+7] = v[13];
923                         dpsoftrast.uniform4f[index*4+8] = v[2];
924                         dpsoftrast.uniform4f[index*4+9] = v[6];
925                         dpsoftrast.uniform4f[index*4+10] = v[10];
926                         dpsoftrast.uniform4f[index*4+11] = v[14];
927                         dpsoftrast.uniform4f[index*4+12] = v[3];
928                         dpsoftrast.uniform4f[index*4+13] = v[7];
929                         dpsoftrast.uniform4f[index*4+14] = v[11];
930                         dpsoftrast.uniform4f[index*4+15] = v[15];
931                 }
932                 else
933                 {
934                         dpsoftrast.uniform4f[index*4+0] = v[0];
935                         dpsoftrast.uniform4f[index*4+1] = v[1];
936                         dpsoftrast.uniform4f[index*4+2] = v[2];
937                         dpsoftrast.uniform4f[index*4+3] = v[3];
938                         dpsoftrast.uniform4f[index*4+4] = v[4];
939                         dpsoftrast.uniform4f[index*4+5] = v[5];
940                         dpsoftrast.uniform4f[index*4+6] = v[6];
941                         dpsoftrast.uniform4f[index*4+7] = v[7];
942                         dpsoftrast.uniform4f[index*4+8] = v[8];
943                         dpsoftrast.uniform4f[index*4+9] = v[9];
944                         dpsoftrast.uniform4f[index*4+10] = v[10];
945                         dpsoftrast.uniform4f[index*4+11] = v[11];
946                         dpsoftrast.uniform4f[index*4+12] = v[12];
947                         dpsoftrast.uniform4f[index*4+13] = v[13];
948                         dpsoftrast.uniform4f[index*4+14] = v[14];
949                         dpsoftrast.uniform4f[index*4+15] = v[15];
950                 }
951         }
952 }
953 void DPSOFTRAST_Uniform1iARB(DPSOFTRAST_UNIFORM index, int i0)
954 {
955         dpsoftrast.uniform1i[index] = i0;
956 }
957
958 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
959 {
960         int i;
961         int j;
962         int stride;
963         const float *v;
964         float *p;
965         float *data;
966         const unsigned char *b;
967         dpsoftrast.draw.numvertices = numvertices;
968         if (dpsoftrast.draw.maxvertices < dpsoftrast.draw.numvertices)
969         {
970                 if (dpsoftrast.draw.maxvertices < 4096)
971                         dpsoftrast.draw.maxvertices = 4096;
972                 while (dpsoftrast.draw.maxvertices < dpsoftrast.draw.numvertices)
973                         dpsoftrast.draw.maxvertices *= 2;
974                 if (dpsoftrast.draw.in_array4f[0])
975                         MM_FREE(dpsoftrast.draw.in_array4f[0]);
976                 data = (float *)MM_CALLOC(1, dpsoftrast.draw.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
977                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.draw.maxvertices * 4)
978                         dpsoftrast.draw.in_array4f[i] = data;
979                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.draw.maxvertices * 4)
980                         dpsoftrast.draw.post_array4f[i] = data;
981                 dpsoftrast.draw.screencoord4f = data;
982                 data += dpsoftrast.draw.maxvertices * 4;
983         }
984         stride = dpsoftrast.stride_vertex;
985         v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
986         p = dpsoftrast.draw.in_array4f[0];
987         for (i = 0;i < numvertices;i++)
988         {
989                 p[0] = v[0];
990                 p[1] = v[1];
991                 p[2] = v[2];
992                 p[3] = 1.0f;
993                 p += 4;
994                 v = (const float *)((const unsigned char *)v + stride);
995         }
996         if (needcolors)
997         {
998                 if (dpsoftrast.pointer_color4f)
999                 {
1000                         stride = dpsoftrast.stride_color;
1001                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1002                         p = dpsoftrast.draw.in_array4f[1];
1003                         for (i = 0;i < numvertices;i++)
1004                         {
1005                                 p[0] = v[0];
1006                                 p[1] = v[1];
1007                                 p[2] = v[2];
1008                                 p[3] = v[3];
1009                                 p += 4;
1010                                 v = (const float *)((const unsigned char *)v + stride);
1011                         }
1012                 }
1013                 else if (dpsoftrast.pointer_color4ub)
1014                 {
1015                         stride = dpsoftrast.stride_color;
1016                         b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1017                         p = dpsoftrast.draw.in_array4f[1];
1018                         for (i = 0;i < numvertices;i++)
1019                         {
1020                                 p[0] = b[0] * (1.0f / 255.0f);
1021                                 p[1] = b[1] * (1.0f / 255.0f);
1022                                 p[2] = b[2] * (1.0f / 255.0f);
1023                                 p[3] = b[3] * (1.0f / 255.0f);
1024                                 p += 4;
1025                                 b = (const unsigned char *)((const unsigned char *)b + stride);
1026                         }
1027                 }
1028                 else
1029                 {
1030                         v = dpsoftrast.user.color;
1031                         p = dpsoftrast.draw.in_array4f[1];
1032                         for (i = 0;i < numvertices;i++)
1033                         {
1034                                 p[0] = v[0];
1035                                 p[1] = v[1];
1036                                 p[2] = v[2];
1037                                 p[3] = v[3];
1038                                 p += 4;
1039                         }
1040                 }
1041         }
1042         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1043         {
1044                 if (dpsoftrast.pointer_texcoordf[j])
1045                 {
1046                         stride = dpsoftrast.stride_texcoord[j];
1047                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1048                         p = dpsoftrast.draw.in_array4f[j+2];
1049                         switch(dpsoftrast.components_texcoord[j])
1050                         {
1051                         case 2:
1052                                 for (i = 0;i < numvertices;i++)
1053                                 {
1054                                         p[0] = v[0];
1055                                         p[1] = v[1];
1056                                         p[2] = 0.0f;
1057                                         p[3] = 1.0f;
1058                                         p += 4;
1059                                         v = (const float *)((const unsigned char *)v + stride);
1060                                 }
1061                                 break;
1062                         case 3:
1063                                 for (i = 0;i < numvertices;i++)
1064                                 {
1065                                         p[0] = v[0];
1066                                         p[1] = v[1];
1067                                         p[2] = v[2];
1068                                         p[3] = 1.0f;
1069                                         p += 4;
1070                                         v = (const float *)((const unsigned char *)v + stride);
1071                                 }
1072                                 break;
1073                         case 4:
1074                                 for (i = 0;i < numvertices;i++)
1075                                 {
1076                                         p[0] = v[0];
1077                                         p[1] = v[1];
1078                                         p[2] = v[2];
1079                                         p[3] = v[3];
1080                                         p += 4;
1081                                         v = (const float *)((const unsigned char *)v + stride);
1082                                 }
1083                                 break;
1084                         }
1085                 }
1086         }
1087 }
1088
// Transforms numitems float[4] vectors by a 4x4 matrix:
// out[c] = in[0]*m[0][c] + in[1]*m[1][c] + in[2]*m[2][c] + in[3]*m[3][c],
// where m[r][c] is inmatrix16f[r*4+c].
// A matrix that is byte-for-byte the identity takes a fast path that copies
// the input to the output.
void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
        static const float identity[16] =
        {
                1,0,0,0,
                0,1,0,0,
                0,0,1,0,
                0,0,0,1
        };
        // TODO: SIMD
        float m[16];
        int item;
        int c;
        memcpy(m, inmatrix16f, sizeof(m));
        if (memcmp(identity, m, sizeof(m)) == 0)
        {
                // identity: the transform is a plain copy
                memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        for (item = 0;item < numitems;item++)
        {
                for (c = 0;c < 4;c++)
                        out4f[c] = in4f[0] * m[c] + in4f[1] * m[4+c] + in4f[2] * m[8+c] + in4f[3] * m[12+c];
                out4f += 4;
                in4f += 4;
        }
}
1110
// Copies numitems float[4] vectors from in4f to out4f with memcpy.
void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
{
        size_t bytes = numitems * sizeof(float[4]);
        memcpy(out4f, in4f, bytes);
}
1115
#ifdef SSE2_PRESENT
// Projects one vertex (x, y, z, w) to
// (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w),
// where c = dpsoftrast.fb_viewportcenter and s = dpsoftrast.fb_viewportscale.
// Both arrays are read with aligned loads.
static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
{
        const __m128 center = _mm_load_ps(dpsoftrast.fb_viewportcenter);
        const __m128 scale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // broadcast w to all four lanes for the divide
        const __m128 wwww = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
        // rotate to (w, x, y, z), then replace the first lane with 1
        const __m128 rotated = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 onexyz = _mm_move_ss(rotated, _mm_set1_ps(1.0f));
        const __m128 scaled = _mm_mul_ps(scale, onexyz);
        const __m128 projected = _mm_add_ps(center, _mm_div_ps(scaled, wwww));
        // rotate back so the lanes are ordered (x, y, z, w-term)
        return _mm_shuffle_ps(projected, projected, _MM_SHUFFLE(0, 3, 2, 1));
}
#endif
1127
// Projects numitems vertices from in4f to out4f. Each (x, y, z, w) becomes
// (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w), with
// c = dpsoftrast.fb_viewportcenter and s = dpsoftrast.fb_viewportscale.
// in4f and out4f are accessed with aligned loads and stores.
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_Draw_ProjectVertices(float *out4f, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
        // NOTE: this is used both as a whole mesh transform function and a
        // per-triangle transform function (for clipped triangles), accordingly
        // it should not crash on divide by 0 but the result of divide by 0 is
        // unimportant...
        const __m128 center = _mm_load_ps(dpsoftrast.fb_viewportcenter);
        const __m128 scale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        const float *src = in4f;
        float *dst = out4f;
        int remaining;
        for (remaining = numitems;remaining > 0;remaining--, src += 4, dst += 4)
        {
                __m128 v = _mm_load_ps(src);
                __m128 wwww = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
                // rotate to (w, x, y, z), then replace the first lane with 1
                v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
                v = _mm_move_ss(v, _mm_set1_ps(1.0f));
                v = _mm_mul_ps(scale, v);
                v = _mm_add_ps(center, _mm_div_ps(v, wwww));
                // rotate back so the lanes are ordered (x, y, z, w-term)
                _mm_store_ps(dst, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
        }
#endif
}
1149
1150 void DPSOFTRAST_Draw_DebugEdgePoints(const float *screen0, const float *screen1)
1151 {
1152         int i;
1153         int x;
1154         int y;
1155         int w = dpsoftrast.fb_width;
1156         int bounds[4];
1157         float v0[2], v1[2];
1158         unsigned int *pixels = dpsoftrast.fb_colorpixels[0];
1159         //const float *c4f;
1160         bounds[0] = dpsoftrast.fb_viewportscissor[0];
1161         bounds[1] = dpsoftrast.fb_viewportscissor[1];
1162         bounds[2] = dpsoftrast.fb_viewportscissor[0] + dpsoftrast.fb_viewportscissor[2];
1163         bounds[3] = dpsoftrast.fb_viewportscissor[1] + dpsoftrast.fb_viewportscissor[3];
1164         v0[0] = screen0[0];
1165         v0[1] = screen0[1];
1166         v1[0] = screen1[0];
1167         v1[1] = screen1[1];
1168         for (i = 0;i <= 128;i++)
1169         {
1170                 // check nearclip
1171                 //if (dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+3] != 1.0f)
1172                 //      continue;
1173                 x = (int)(v0[0] + (v1[0] - v0[0]) * (i/128.0f));
1174                 y = (int)(v0[1] + (v1[1] - v0[1]) * (i/128.0f));
1175                 if (x < bounds[0] || y < bounds[1] || x >= bounds[2] || y >= bounds[3])
1176                         continue;
1177                 //c4f = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_COLOR] + element0*4;
1178                 //pixels[y*w+x] = DPSOFTRAST_BGRA8_FROM_RGBA32F(c4f[0], c4f[1], c4f[2], c4f[3]);
1179                 pixels[y*w+x] = 0xFFFFFFFF;
1180         }
1181 }
1182
1183 void DPSOFTRAST_Draw_Span_Begin(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *zf)
1184 {
1185         int x;
1186         int startx = span->startx;
1187         int endx = span->endx;
1188         float w = span->data[0][DPSOFTRAST_ARRAY_TOTAL][3];
1189         float wslope = span->data[1][DPSOFTRAST_ARRAY_TOTAL][3];
1190         float endz = 1.0f / (w + wslope * startx);
1191         for (x = startx;x < endx;)
1192         {
1193                 int nextsub = x + DPSOFTRAST_MAXSUBSPAN, endsub = nextsub - 1;
1194                 float z = endz, dz;
1195                 if(nextsub >= endx) nextsub = endsub = endx-1;
1196                 endz = 1.0f / (w + wslope * nextsub);
1197                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1198                 for (; x <= endsub; x++, z += dz)
1199                         zf[x] = z;
1200         }
1201 }
1202
1203 void DPSOFTRAST_Draw_Span_Finish(const DPSOFTRAST_State_Draw_Span * RESTRICT span, const float * RESTRICT in4f)
1204 {
1205         int x;
1206         int startx = span->startx;
1207         int endx = span->endx;
1208         int d[4];
1209         float a, b;
1210         unsigned char * RESTRICT pixelmask = span->pixelmask;
1211         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1212         if (!pixel)
1213                 return;
1214         pixel += span->start * 4;
1215         // handle alphatest now (this affects depth writes too)
1216         if (dpsoftrast.user.alphatest)
1217                 for (x = startx;x < endx;x++)
1218                         if (in4f[x*4+3] < 0.5f)
1219                                 pixelmask[x] = false;
1220         // FIXME: this does not handle bigendian
1221         switch(dpsoftrast.fb_blendmode)
1222         {
1223         case DPSOFTRAST_BLENDMODE_OPAQUE:
1224                 for (x = startx;x < endx;x++)
1225                 {
1226                         if (!pixelmask[x])
1227                                 continue;
1228                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1229                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1230                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1231                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1232                         pixel[x*4+0] = d[0];
1233                         pixel[x*4+1] = d[1];
1234                         pixel[x*4+2] = d[2];
1235                         pixel[x*4+3] = d[3];
1236                 }
1237                 break;
1238         case DPSOFTRAST_BLENDMODE_ALPHA:
1239                 for (x = startx;x < endx;x++)
1240                 {
1241                         if (!pixelmask[x])
1242                                 continue;
1243                         a = in4f[x*4+3] * 255.0f;
1244                         b = 1.0f - in4f[x*4+3];
1245                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1246                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1247                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1248                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1249                         pixel[x*4+0] = d[0];
1250                         pixel[x*4+1] = d[1];
1251                         pixel[x*4+2] = d[2];
1252                         pixel[x*4+3] = d[3];
1253                 }
1254                 break;
1255         case DPSOFTRAST_BLENDMODE_ADDALPHA:
1256                 for (x = startx;x < endx;x++)
1257                 {
1258                         if (!pixelmask[x])
1259                                 continue;
1260                         a = in4f[x*4+3] * 255.0f;
1261                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1262                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1263                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1264                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1265                         pixel[x*4+0] = d[0];
1266                         pixel[x*4+1] = d[1];
1267                         pixel[x*4+2] = d[2];
1268                         pixel[x*4+3] = d[3];
1269                 }
1270                 break;
1271         case DPSOFTRAST_BLENDMODE_ADD:
1272                 for (x = startx;x < endx;x++)
1273                 {
1274                         if (!pixelmask[x])
1275                                 continue;
1276                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1277                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1278                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1279                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1280                         pixel[x*4+0] = d[0];
1281                         pixel[x*4+1] = d[1];
1282                         pixel[x*4+2] = d[2];
1283                         pixel[x*4+3] = d[3];
1284                 }
1285                 break;
1286         case DPSOFTRAST_BLENDMODE_INVMOD:
1287                 for (x = startx;x < endx;x++)
1288                 {
1289                         if (!pixelmask[x])
1290                                 continue;
1291                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1292                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1293                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1294                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1295                         pixel[x*4+0] = d[0];
1296                         pixel[x*4+1] = d[1];
1297                         pixel[x*4+2] = d[2];
1298                         pixel[x*4+3] = d[3];
1299                 }
1300                 break;
1301         case DPSOFTRAST_BLENDMODE_MUL:
1302                 for (x = startx;x < endx;x++)
1303                 {
1304                         if (!pixelmask[x])
1305                                 continue;
1306                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1307                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1308                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1309                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1310                         pixel[x*4+0] = d[0];
1311                         pixel[x*4+1] = d[1];
1312                         pixel[x*4+2] = d[2];
1313                         pixel[x*4+3] = d[3];
1314                 }
1315                 break;
1316         case DPSOFTRAST_BLENDMODE_MUL2:
1317                 for (x = startx;x < endx;x++)
1318                 {
1319                         if (!pixelmask[x])
1320                                 continue;
1321                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
1322                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
1323                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
1324                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
1325                         pixel[x*4+0] = d[0];
1326                         pixel[x*4+1] = d[1];
1327                         pixel[x*4+2] = d[2];
1328                         pixel[x*4+3] = d[3];
1329                 }
1330                 break;
1331         case DPSOFTRAST_BLENDMODE_SUBALPHA:
1332                 for (x = startx;x < endx;x++)
1333                 {
1334                         if (!pixelmask[x])
1335                                 continue;
1336                         a = in4f[x*4+3] * -255.0f;
1337                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
1338                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
1339                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
1340                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
1341                         pixel[x*4+0] = d[0];
1342                         pixel[x*4+1] = d[1];
1343                         pixel[x*4+2] = d[2];
1344                         pixel[x*4+3] = d[3];
1345                 }
1346                 break;
1347         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
1348                 for (x = startx;x < endx;x++)
1349                 {
1350                         if (!pixelmask[x])
1351                                 continue;
1352                         a = 255.0f;
1353                         b = 1.0f - in4f[x*4+3];
1354                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1355                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1356                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1357                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1358                         pixel[x*4+0] = d[0];
1359                         pixel[x*4+1] = d[1];
1360                         pixel[x*4+2] = d[2];
1361                         pixel[x*4+3] = d[3];
1362                 }
1363                 break;
1364         }
1365 }
1366
1367 void DPSOFTRAST_Draw_Span_FinishBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
1368 {
1369 #ifdef SSE2_PRESENT
1370         int x;
1371         int startx = span->startx;
1372         int endx = span->endx;
1373         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
1374         unsigned char * RESTRICT pixelmask = span->pixelmask;
1375         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1376         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
1377         if (!pixel)
1378                 return;
1379         pixel += span->start * 4;
1380         pixeli += span->start;
1381         // handle alphatest now (this affects depth writes too)
1382         if (dpsoftrast.user.alphatest)
1383                 for (x = startx;x < endx;x++)
1384                         if (in4ub[x*4+3] < 0.5f)
1385                                 pixelmask[x] = false;
1386         // FIXME: this does not handle bigendian
1387         switch(dpsoftrast.fb_blendmode)
1388         {
1389         case DPSOFTRAST_BLENDMODE_OPAQUE:
1390                 for (x = startx;x + 4 <= endx;)
1391                 {
1392                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
1393                         {
1394                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
1395                                 x += 4;
1396                         }
1397                         else
1398                         {
1399                                 if (pixelmask[x])
1400                                         pixeli[x] = ini[x];
1401                                 x++;
1402                         }
1403                 }
1404                 for (;x < endx;x++)
1405                         if (pixelmask[x])
1406                                 pixeli[x] = ini[x];
1407                 break;
1408         case DPSOFTRAST_BLENDMODE_ALPHA:
1409         #define FINISHBLEND(blend2, blend1) \
1410                 for (x = startx;x + 2 <= endx;x += 2) \
1411                 { \
1412                         __m128i src, dst; \
1413                         switch (*(const unsigned short*)&pixelmask[x]) \
1414                         { \
1415                         case 0x0101: \
1416                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
1417                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
1418                                 blend2; \
1419                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
1420                                 continue; \
1421                         case 0x0100: \
1422                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
1423                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
1424                                 blend1; \
1425                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
1426                                 continue; \
1427                         case 0x0001: \
1428                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
1429                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
1430                                 blend1; \
1431                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
1432                                 continue; \
1433                         } \
1434                         break; \
1435                 } \
1436                 for(;x < endx; x++) \
1437                 { \
1438                         __m128i src, dst; \
1439                         if (!pixelmask[x]) \
1440                                 continue; \
1441                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
1442                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
1443                         blend1; \
1444                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
1445                 }
1446
1447                 FINISHBLEND({
1448                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
1449                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
1450                 }, {
1451                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
1452                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
1453                 });
1454                 break;
1455         case DPSOFTRAST_BLENDMODE_ADDALPHA:
1456                 FINISHBLEND({
1457                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
1458                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
1459                 }, {
1460                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
1461                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
1462                 });
1463                 break;
1464         case DPSOFTRAST_BLENDMODE_ADD:
1465                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
1466                 break;
1467         case DPSOFTRAST_BLENDMODE_INVMOD:
1468                 FINISHBLEND({
1469                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
1470                 }, {
1471                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
1472                 });
1473                 break;
1474         case DPSOFTRAST_BLENDMODE_MUL:
1475                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
1476                 break;
1477         case DPSOFTRAST_BLENDMODE_MUL2:
1478                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
1479                 break;
1480         case DPSOFTRAST_BLENDMODE_SUBALPHA:
1481                 FINISHBLEND({
1482                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
1483                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
1484                 }, {
1485                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
1486                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
1487                 });
1488                 break;
1489         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
1490                 FINISHBLEND({
1491                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
1492                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
1493                 }, {
1494                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
1495                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
1496                 });
1497                 break;
1498         }
1499 #endif
1500 }
1501
1502 void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
1503 {
1504         int x;
1505         int startx = span->startx;
1506         int endx = span->endx;
1507         int flags;
1508         float c[4];
1509         float data[4];
1510         float slope[4];
1511         float tc[2], endtc[2];
1512         float tcscale[2];
1513         unsigned int tci[2];
1514         unsigned int tci1[2];
1515         unsigned int tcimin[2];
1516         unsigned int tcimax[2];
1517         int tciwrapmask[2];
1518         int tciwidth;
1519         int filter;
1520         int mip;
1521         const unsigned char * RESTRICT pixelbase;
1522         const unsigned char * RESTRICT pixel[4];
1523         DPSOFTRAST_Texture *texture = dpsoftrast.texbound[texunitindex];
1524         // if no texture is bound, just fill it with white
1525         if (!texture)
1526         {
1527                 for (x = startx;x < endx;x++)
1528                 {
1529                         out4f[x*4+0] = 1.0f;
1530                         out4f[x*4+1] = 1.0f;
1531                         out4f[x*4+2] = 1.0f;
1532                         out4f[x*4+3] = 1.0f;
1533                 }
1534                 return;
1535         }
1536         mip = span->mip[texunitindex];
1537         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
1538         // if this mipmap of the texture is 1 pixel, just fill it with that color
1539         if (texture->mipmap[mip][1] == 4)
1540         {
1541                 c[0] = texture->bytes[2] * (1.0f/255.0f);
1542                 c[1] = texture->bytes[1] * (1.0f/255.0f);
1543                 c[2] = texture->bytes[0] * (1.0f/255.0f);
1544                 c[3] = texture->bytes[3] * (1.0f/255.0f);
1545                 for (x = startx;x < endx;x++)
1546                 {
1547                         out4f[x*4+0] = c[0];
1548                         out4f[x*4+1] = c[1];
1549                         out4f[x*4+2] = c[2];
1550                         out4f[x*4+3] = c[3];
1551                 }
1552                 return;
1553         }
1554         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
1555         data[0] = span->data[0][arrayindex][0];
1556         data[1] = span->data[0][arrayindex][1];
1557         data[2] = span->data[0][arrayindex][2];
1558         data[3] = span->data[0][arrayindex][3];
1559         slope[0] = span->data[1][arrayindex][0];
1560         slope[1] = span->data[1][arrayindex][1];
1561         slope[2] = span->data[1][arrayindex][2];
1562         slope[3] = span->data[1][arrayindex][3];
1563         flags = texture->flags;
1564         tcscale[0] = texture->mipmap[mip][2];
1565         tcscale[1] = texture->mipmap[mip][3];
1566         tciwidth = texture->mipmap[mip][2];
1567         tcimin[0] = 0;
1568         tcimin[1] = 0;
1569         tcimax[0] = texture->mipmap[mip][2]-1;
1570         tcimax[1] = texture->mipmap[mip][3]-1;
1571         tciwrapmask[0] = texture->mipmap[mip][2]-1;
1572         tciwrapmask[1] = texture->mipmap[mip][3]-1;
1573         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
1574         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
1575         for (x = startx;x < endx;)
1576         {
1577                 unsigned int subtc[2];
1578                 unsigned int substep[2];
1579                 float subscale = 65536.0f/DPSOFTRAST_MAXSUBSPAN;
1580                 int nextsub = x + DPSOFTRAST_MAXSUBSPAN, endsub = nextsub - 1;
1581                 if(nextsub >= endx)
1582                 {
1583                         nextsub = endsub = endx-1;      
1584                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
1585                 }
1586                 tc[0] = endtc[0];
1587                 tc[1] = endtc[1];
1588                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
1589                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
1590                 substep[0] = (endtc[0] - tc[0]) * subscale;
1591                 substep[1] = (endtc[1] - tc[1]) * subscale;
1592                 subtc[0] = tc[0] * (1<<16);
1593                 subtc[1] = tc[1] * (1<<16);
1594                 if(filter)
1595                 {
1596                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
1597                         {
1598                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
1599                                 {
1600                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
1601                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
1602                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
1603                                         tci[0] = subtc[0]>>16;
1604                                         tci[1] = subtc[1]>>16;
1605                                         tci1[0] = tci[0] + 1;
1606                                         tci1[1] = tci[1] + 1;
1607                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
1608                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
1609                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
1610                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
1611                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
1612                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
1613                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
1614                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
1615                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
1616                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
1617                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
1618                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
1619                                         out4f[x*4+0] = c[0];
1620                                         out4f[x*4+1] = c[1];
1621                                         out4f[x*4+2] = c[2];
1622                                         out4f[x*4+3] = c[3];
1623                                 }
1624                         }
1625                         else
1626                         {
1627                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
1628                                 {
1629                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
1630                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
1631                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
1632                                         tci[0] = subtc[0]>>16;
1633                                         tci[1] = subtc[1]>>16;
1634                                         tci1[0] = tci[0] + 1;
1635                                         tci1[1] = tci[1] + 1;
1636                                         tci[0] &= tciwrapmask[0];
1637                                         tci[1] &= tciwrapmask[1];
1638                                         tci1[0] &= tciwrapmask[0];
1639                                         tci1[1] &= tciwrapmask[1];
1640                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
1641                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
1642                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
1643                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
1644                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
1645                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
1646                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
1647                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
1648                                         out4f[x*4+0] = c[0];
1649                                         out4f[x*4+1] = c[1];
1650                                         out4f[x*4+2] = c[2];
1651                                         out4f[x*4+3] = c[3];
1652                                 }
1653                         }
1654                 }
1655                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
1656                 {
1657                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
1658                         {
1659                                 tci[0] = subtc[0]>>16;
1660                                 tci[1] = subtc[1]>>16;
1661                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
1662                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
1663                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
1664                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
1665                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
1666                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
1667                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
1668                                 out4f[x*4+0] = c[0];
1669                                 out4f[x*4+1] = c[1];
1670                                 out4f[x*4+2] = c[2];
1671                                 out4f[x*4+3] = c[3];
1672                         }
1673                 }
1674                 else
1675                 {
1676                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
1677                         {
1678                                 tci[0] = subtc[0]>>16;
1679                                 tci[1] = subtc[1]>>16;
1680                                 tci[0] &= tciwrapmask[0];
1681                                 tci[1] &= tciwrapmask[1];
1682                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
1683                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
1684                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
1685                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
1686                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
1687                                 out4f[x*4+0] = c[0];
1688                                 out4f[x*4+1] = c[1];
1689                                 out4f[x*4+2] = c[2];
1690                                 out4f[x*4+3] = c[3];
1691                         }
1692                 }
1693         }
1694 }
1695
1696 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
1697 {
1698 #ifdef SSE2_PRESENT
1699         int x;
1700         int startx = span->startx;
1701         int endx = span->endx;
1702         int flags;
1703         __m128 data, slope, tcscale;
1704         __m128i tcsize, tcmask, tcoffset, tcmax;
1705         __m128 tc, endtc;
1706         __m128i subtc, substep, endsubtc;
1707         int filter;
1708         int mip;
1709         unsigned int *outi = (unsigned int *)out4ub;
1710         const unsigned char * RESTRICT pixelbase;
1711         DPSOFTRAST_Texture *texture = dpsoftrast.texbound[texunitindex];
1712         // if no texture is bound, just fill it with white
1713         if (!texture)
1714         {
1715                 memset(out4ub + startx*4, 255, span->length*4);
1716                 return;
1717         }
1718         mip = span->mip[texunitindex];
1719         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
1720         // if this mipmap of the texture is 1 pixel, just fill it with that color
1721         if (texture->mipmap[mip][1] == 4)
1722         {
1723                 unsigned int k = *((const unsigned int *)pixelbase);
1724                 for (x = startx;x < endx;x++)
1725                         outi[x] = k;
1726                 return;
1727         }
1728         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
1729         data = _mm_load_ps(span->data[0][arrayindex]);
1730         slope = _mm_load_ps(span->data[1][arrayindex]);
1731         flags = texture->flags;
1732         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
1733         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
1734         tcscale = _mm_cvtepi32_ps(tcsize);
1735         data = _mm_mul_ps(_mm_shuffle_ps(data, data, _MM_SHUFFLE(1, 0, 1, 0)), tcscale);
1736         slope = _mm_mul_ps(_mm_shuffle_ps(slope, slope, _MM_SHUFFLE(1, 0, 1, 0)), tcscale);
1737         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
1738         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
1739         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
1740         tcmax = filter ? _mm_packs_epi32(tcmask, tcmask) : _mm_slli_epi32(tcmask, 16);  
1741         for (x = startx;x < endx;)
1742         {
1743                 int nextsub = x + DPSOFTRAST_MAXSUBSPAN, endsub = nextsub - 1;
1744                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_MAXSUBSPAN);
1745                 if(nextsub >= endx)
1746                 {
1747                         nextsub = endsub = endx-1;
1748                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
1749                 }       
1750                 tc = endtc;
1751                 subtc = endsubtc;
1752                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
1753                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
1754                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
1755                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
1756                 substep = _mm_slli_epi32(substep, 1);
1757                 if (filter)
1758                 {
1759                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
1760                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
1761                         {
1762                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
1763                                 {
1764                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
1765                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
1766                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
1767                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
1768                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
1769                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
1770                                         fracm = _mm_srli_epi16(subtc, 1);
1771                                         pix1 = _mm_add_epi16(pix1,
1772                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1773                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
1774                                         pix3 = _mm_add_epi16(pix3,
1775                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
1776                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
1777                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
1778                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
1779                                         pix2 = _mm_add_epi16(pix2,
1780                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
1781                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
1782                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
1783                                 }
1784                                 if (x <= endsub)
1785                                 {
1786                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
1787                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
1788                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
1789                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
1790                                         fracm = _mm_srli_epi16(subtc, 1);
1791                                         pix1 = _mm_add_epi16(pix1,
1792                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1793                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
1794                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
1795                                         pix1 = _mm_add_epi16(pix1,
1796                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1797                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
1798                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
1799                                         x++;
1800                                 }
1801                         }
1802                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
1803                         {
1804                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
1805                                 {
1806                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
1807                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
1808                                         tci = _mm_madd_epi16(tci, tcoffset);
1809                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
1810                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
1811                                                                                         _mm_setzero_si128());
1812                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
1813                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
1814                                                                                         _mm_setzero_si128());
1815                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
1816                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
1817                                         tci = _mm_madd_epi16(tci, tcoffset);
1818                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
1819                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
1820                                                                                         _mm_setzero_si128());
1821                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
1822                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
1823                                                                                         _mm_setzero_si128());
1824                                         fracm = _mm_srli_epi16(subtc, 1);
1825                                         pix1 = _mm_add_epi16(pix1,
1826                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1827                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
1828                                         pix3 = _mm_add_epi16(pix3,
1829                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
1830                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
1831                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
1832                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
1833                                         pix2 = _mm_add_epi16(pix2,
1834                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
1835                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
1836                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
1837                                 }
1838                                 if (x <= endsub)
1839                                 {
1840                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
1841                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
1842                                         tci = _mm_madd_epi16(tci, tcoffset);
1843                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
1844                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
1845                                                                                         _mm_setzero_si128());
1846                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
1847                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
1848                                                                                         _mm_setzero_si128());
1849                                         fracm = _mm_srli_epi16(subtc, 1);
1850                                         pix1 = _mm_add_epi16(pix1,
1851                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1852                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
1853                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
1854                                         pix1 = _mm_add_epi16(pix1,
1855                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1856                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
1857                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
1858                                         x++;
1859                                 }
1860                         }
1861                         else
1862                         {
1863                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
1864                                 {
1865                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
1866                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
1867                                         tci = _mm_madd_epi16(tci, tcoffset);
1868                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
1869                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
1870                                                                                         _mm_setzero_si128());
1871                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
1872                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
1873                                                                                         _mm_setzero_si128());
1874                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
1875                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
1876                                         tci = _mm_madd_epi16(tci, tcoffset);
1877                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
1878                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
1879                                                                                         _mm_setzero_si128());
1880                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
1881                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
1882                                                                                         _mm_setzero_si128());
1883                                         fracm = _mm_srli_epi16(subtc, 1);
1884                                         pix1 = _mm_add_epi16(pix1,
1885                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1886                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
1887                                         pix3 = _mm_add_epi16(pix3,
1888                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
1889                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
1890                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
1891                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
1892                                         pix2 = _mm_add_epi16(pix2,
1893                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
1894                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
1895                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
1896                                 }
1897                                 if (x <= endsub)
1898                                 {
1899                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
1900                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
1901                                         tci = _mm_madd_epi16(tci, tcoffset);
1902                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
1903                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
1904                                                                                         _mm_setzero_si128());
1905                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
1906                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
1907                                                                                         _mm_setzero_si128());
1908                                         fracm = _mm_srli_epi16(subtc, 1);
1909                                         pix1 = _mm_add_epi16(pix1,
1910                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1911                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
1912                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
1913                                         pix1 = _mm_add_epi16(pix1,
1914                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
1915                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
1916                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
1917                                         x++;
1918                                 }
1919                         }
1920                 }
1921                 else
1922                 {
1923                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
1924                         {
1925                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
1926                                 {
1927                                         __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax); 
1928                                         tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
1929                                         tci = _mm_madd_epi16(tci, tcoffset);
1930                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
1931                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
1932                                 }
1933                                 if (x <= endsub)
1934                                 {
1935                                         __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
1936                                         tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
1937                                         tci = _mm_madd_epi16(tci, tcoffset);
1938                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
1939                                         x++;
1940                                 }
1941                         }
1942                         else
1943                         {
1944                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
1945                                 {
1946                                         __m128i tci = _mm_and_si128(subtc, tcmax); 
1947                                         tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
1948                                         tci = _mm_madd_epi16(tci, tcoffset);
1949                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
1950                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
1951                                 }
1952                                 if (x <= endsub)
1953                                 {
1954                                         __m128i tci = _mm_and_si128(subtc, tcmax); 
1955                                         tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
1956                                         tci = _mm_madd_epi16(tci, tcoffset);
1957                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
1958                                         x++;
1959                                 }
1960                         }
1961                 }
1962         }
1963 #endif
1964 }
1965
1966 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
1967 {
1968         // TODO: IMPLEMENT
1969         memset(out4ub, 255, span->length*4);
1970 }
1971
// Placeholder shadowmap lookup: the vector argument is not read and the
// function always returns 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	(void)vector;
	return 1.0f;
}
1977
1978 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
1979 {
1980         int x;
1981         int startx = span->startx;
1982         int endx = span->endx;
1983         float c[4];
1984         float data[4];
1985         float slope[4];
1986         float z;
1987         data[0] = span->data[0][arrayindex][0];
1988         data[1] = span->data[0][arrayindex][1];
1989         data[2] = span->data[0][arrayindex][2];
1990         data[3] = span->data[0][arrayindex][3];
1991         slope[0] = span->data[1][arrayindex][0];
1992         slope[1] = span->data[1][arrayindex][1];
1993         slope[2] = span->data[1][arrayindex][2];
1994         slope[3] = span->data[1][arrayindex][3];
1995         for (x = startx;x < endx;x++)
1996         {
1997                 z = zf[x];
1998                 c[0] = (data[0] + slope[0]*x) * z;
1999                 c[1] = (data[1] + slope[1]*x) * z;
2000                 c[2] = (data[2] + slope[2]*x) * z;
2001                 c[3] = (data[3] + slope[3]*x) * z;
2002                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2003                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2004                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2005                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2006         }
2007 }
2008
2009 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2010 {
2011         int x;
2012         int startx = span->startx;
2013         int endx = span->endx;
2014         float c[4];
2015         float data[4];
2016         float slope[4];
2017         float z;
2018         data[0] = span->data[0][arrayindex][0];
2019         data[1] = span->data[0][arrayindex][1];
2020         data[2] = span->data[0][arrayindex][2];
2021         data[3] = span->data[0][arrayindex][3];
2022         slope[0] = span->data[1][arrayindex][0];
2023         slope[1] = span->data[1][arrayindex][1];
2024         slope[2] = span->data[1][arrayindex][2];
2025         slope[3] = span->data[1][arrayindex][3];
2026         for (x = startx;x < endx;x++)
2027         {
2028                 z = zf[x];
2029                 c[0] = (data[0] + slope[0]*x) * z;
2030                 c[1] = (data[1] + slope[1]*x) * z;
2031                 c[2] = (data[2] + slope[2]*x) * z;
2032                 c[3] = (data[3] + slope[3]*x) * z;
2033                 out4f[x*4+0] = c[0];
2034                 out4f[x*4+1] = c[1];
2035                 out4f[x*4+2] = c[2];
2036                 out4f[x*4+3] = c[3];
2037         }
2038 }
2039
2040 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2041 {
2042         int x, startx = span->startx, endx = span->endx;
2043         float c[4], localcolor[4];
2044         localcolor[0] = subcolor[0];
2045         localcolor[1] = subcolor[1];
2046         localcolor[2] = subcolor[2];
2047         localcolor[3] = subcolor[3];
2048         for (x = startx;x < endx;x++)
2049         {
2050                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2051                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2052                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2053                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2054                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2055                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2056                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2057                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2058         }
2059 }
2060
2061 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2062 {
2063         int x, startx = span->startx, endx = span->endx;
2064         for (x = startx;x < endx;x++)
2065         {
2066                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2067                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2068                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2069                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2070         }
2071 }
2072
2073 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2074 {
2075         int x, startx = span->startx, endx = span->endx;
2076         for (x = startx;x < endx;x++)
2077         {
2078                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2079                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2080                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2081                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2082         }
2083 }
2084
2085 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2086 {
2087         int x, startx = span->startx, endx = span->endx;
2088         float a, b;
2089         for (x = startx;x < endx;x++)
2090         {
2091                 a = 1.0f - inb4f[x*4+3];
2092                 b = inb4f[x*4+3];
2093                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2094                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2095                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2096                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2097         }
2098 }
2099
2100 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2101 {
2102         int x, startx = span->startx, endx = span->endx;
2103         float localcolor[4], ilerp, lerp;
2104         localcolor[0] = color[0];
2105         localcolor[1] = color[1];
2106         localcolor[2] = color[2];
2107         localcolor[3] = color[3];
2108         ilerp = 1.0f - localcolor[3];
2109         lerp = localcolor[3];
2110         for (x = startx;x < endx;x++)
2111         {
2112                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2113                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2114                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2115                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2116         }
2117 }
2118
2119
2120
2121 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2122 {
2123 #ifdef SSE2_PRESENT
2124         int x;
2125         int startx = span->startx;
2126         int endx = span->endx;
2127         __m128 data = _mm_load_ps(span->data[0][arrayindex]), slope = _mm_load_ps(span->data[1][arrayindex]);
2128         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2129         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2130         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2131         data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2132         slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
2133         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2134         {
2135                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2136                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2137                 data = _mm_add_ps(data, slope);
2138                 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2139                 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2140                 pix = _mm_mulhi_epu16(pix, mod);
2141                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2142         }
2143         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2144         {
2145                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2146                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2147                 mod = _mm_packs_epi32(mod, mod);
2148                 pix = _mm_mulhi_epu16(pix, mod);
2149                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2150         }
2151 #endif
2152 }
2153
2154 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2155 {
2156 #ifdef SSE2_PRESENT
2157         int x;
2158         int startx = span->startx;
2159         int endx = span->endx;
2160         __m128 data = _mm_load_ps(span->data[0][arrayindex]), slope = _mm_load_ps(span->data[1][arrayindex]);
2161         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2162         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2163         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2164         data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2165         slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
2166         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2167         {
2168                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2169                 data = _mm_add_ps(data, slope);
2170                 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2171                 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2172                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2173         }
2174         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2175         {
2176                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2177                 pix = _mm_packs_epi32(pix, pix);
2178                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2179         }
2180 #endif
2181 }
2182
2183 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2184 {
2185 #ifdef SSE2_PRESENT
2186         int x, startx = span->startx, endx = span->endx;
2187         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2188         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2189         for (x = startx;x+2 <= endx;x+=2)
2190         {
2191                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2192                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2193                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2194                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2195         }
2196         if(x < endx)
2197         {
2198                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2199                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2200                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2201                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2202         }
2203 #endif
2204 }
2205
2206 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2207 {
2208 #ifdef SSE2_PRESENT
2209         int x, startx = span->startx, endx = span->endx;
2210         for (x = startx;x+2 <= endx;x+=2)
2211         {
2212                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2213                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2214                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2215                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2216         }
2217         if(x < endx)
2218         {
2219                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2220                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2221                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2222                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2223         }
2224 #endif
2225 }
2226
2227 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2228 {
2229 #ifdef SSE2_PRESENT
2230         int x, startx = span->startx, endx = span->endx;
2231         for (x = startx;x+2 <= endx;x+=2)
2232         {
2233                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2234                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2235                 pix1 = _mm_add_epi16(pix1, pix2);
2236                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2237         }
2238         if(x < endx)
2239         {
2240                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2241                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2242                 pix1 = _mm_add_epi16(pix1, pix2);
2243                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2244         }
2245 #endif
2246 }
2247
2248 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2249 {
2250 #ifdef SSE2_PRESENT
2251         int x, startx = span->startx, endx = span->endx;
2252         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2253         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
2254         for (x = startx;x+2 <= endx;x+=2)
2255         {
2256                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2257                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2258                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2259                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2260         }
2261         if(x < endx)
2262         {
2263                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2264                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2265                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2266                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2267         }
2268 #endif
2269 }
2270
2271 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2272 {
2273 #ifdef SSE2_PRESENT
2274         int x, startx = span->startx, endx = span->endx;
2275         for (x = startx;x+2 <= endx;x+=2)
2276         {
2277                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2278                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2279                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2280                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2281                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2282         }
2283         if(x < endx)
2284         {
2285                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2286                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2287                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2288                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2289                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2290         }
2291 #endif
2292 }
2293
2294 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2295 {
2296 #ifdef SSE2_PRESENT
2297         int x, startx = span->startx, endx = span->endx;
2298         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
2299         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2300         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
2301         for (x = startx;x+2 <= endx;x+=2)
2302         {
2303                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
2304                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2305                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2306         }
2307         if(x < endx)
2308         {
2309                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2310                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2311                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2312         }
2313 #endif
2314 }
2315
2316
2317
2318 void DPSOFTRAST_VertexShader_Generic(void)
2319 {
2320         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2321         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.numvertices);
2322         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
2323         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2324                 DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.draw.numvertices);
2325 }
2326
2327 void DPSOFTRAST_PixelShader_Generic(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
2328 {
2329         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2330         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2331         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2332         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2333         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
2334         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_DIFFUSE)
2335         {
2336                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
2337                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
2338                 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2339                 {
2340                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
2341                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2342                         {
2343                                 // multiply
2344                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2345                         }
2346                         else if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2347                         {
2348                                 // add
2349                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2350                         }
2351                         else if (dpsoftrast.shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
2352                         {
2353                                 // alphablend
2354                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2355                         }
2356                 }
2357         }
2358         else
2359                 DPSOFTRAST_Draw_Span_VaryingBGRA8(span, buffer_FragColorbgra8, 1, buffer_z);
2360         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
2361 }
2362
2363
2364
2365 void DPSOFTRAST_VertexShader_PostProcess(void)
2366 {
2367         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2368         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
2369         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.draw.numvertices);
2370 }
2371
2372 void DPSOFTRAST_PixelShader_PostProcess(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
2373 {
2374         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
2375         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2376         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2377         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2378         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
2379         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
2380         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_BLOOM)
2381         {
2382                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
2383                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, dpsoftrast.uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
2384         }
2385         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(span, buffer_FragColorbgra8, buffer_FragColorbgra8, dpsoftrast.uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
2386         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SATURATION)
2387         {
2388                 // TODO: implement saturation
2389         }
2390         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
2391         {
2392                 // TODO: implement gammaramps
2393         }
2394         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
2395 }
2396
2397
2398
2399 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
2400 {
2401         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2402 }
2403
2404 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
2405 {
2406         // this is never called (because colormask is off when this shader is used)
2407         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2408         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2409         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
2410         memset(buffer_FragColorbgra8, 0, span->length*4);
2411         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
2412 }
2413
2414
2415
2416 void DPSOFTRAST_VertexShader_FlatColor(void)
2417 {
2418         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2419         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
2420 }
2421
2422 void DPSOFTRAST_PixelShader_FlatColor(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
2423 {
2424         int x, startx = span->startx, endx = span->endx;
2425         int Color_Ambienti[4];
2426         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2427         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2428         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2429         Color_Ambienti[2] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
2430         Color_Ambienti[1] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
2431         Color_Ambienti[0] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
2432         Color_Ambienti[3] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
2433         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
2434         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
2435         for (x = startx;x < endx;x++)
2436         {
2437                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
2438                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
2439                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
2440                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
2441         }
2442         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
2443 }
2444
2445
2446
2447 void DPSOFTRAST_VertexShader_VertexColor(void)
2448 {
2449         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2450         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.numvertices);
2451         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
2452 }
2453
// Pixel shader for the vertex-color mode. Per pixel and channel it computes
//   pix = (Color_Diffuse * interpolated_color + Color_Ambient) * texture
// in 16 bit fixed point and saturates the result to bytes. The alpha lane of Color_Diffuse is
// cleared and the alpha lane of Color_Ambient is replaced by the Alpha uniform.
// Pixels are written straight into the color framebuffer unless alpha test is enabled or the blend
// mode is not opaque; in that case they go to a local buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_PixelShader_VertexColor(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
{
#ifdef SSE2_PRESENT
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + span->start * 4;
        int x, startx = span->startx, endx = span->endx;
        __m128i Color_Ambientm, Color_Diffusem;
        __m128 data, slope;
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        int arrayindex = DPSOFTRAST_ARRAY_COLOR;
        DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
        // the literal 2 equals DPSOFTRAST_ARRAY_TEXCOORD0
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
        // results that still need alpha test or blending are collected in the local buffer
        if (dpsoftrast.user.alphatest || dpsoftrast.fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        // ambient * 256 as integers with elements 0 and 2 swapped to match the bgra8 byte order;
        // lane 3 is then cleared and replaced by Alpha * 255, and the result is packed to 16 bit lanes
        // (the same four values end up in both 64 bit halves)
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
        Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
        // diffuse * 4096, same element swap, lane 3 cleared so it contributes nothing to alpha
        Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
        // interpolator for the color array: presumably span->data[0] is the value at x = 0 and
        // span->data[1] the per-pixel step - TODO confirm against the span setup code
        data = _mm_load_ps(span->data[0][arrayindex]);
        slope = _mm_load_ps(span->data[1][arrayindex]);
        data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
        slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
        // advance to the first pixel of the span, then scale value and step by 4096
        data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
        data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
        slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
        // the loop increment steps data by one pixel; the fast path below steps it three more times itself
        for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
        {
                __m128i color, mod, pix;
                // fast path: at least four pixels remain and all four mask bytes are 1
                if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
                {
                        __m128i pix2, mod2;
                        __m128 z = _mm_loadu_ps(&buffer_z[x]);
                        color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                        // mod / mod2 hold the interpolated color times buffer_z for pixels x, x+1 / x+2, x+3
                        mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
                        data = _mm_add_ps(data, slope);
                        mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
                        data = _mm_add_ps(data, slope);
                        mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
                        data = _mm_add_ps(data, slope);
                        mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                        // unpacking with zero in the low byte yields texel * 256, so each high multiply
                        // drops 16 bits and the final lanes are in byte range before the saturating pack
                        pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
                                                                  _mm_unpacklo_epi8(_mm_setzero_si128(), color));
                        pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
                                                                   _mm_unpackhi_epi8(_mm_setzero_si128(), color));
                        _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                        // three extra pixels consumed here, the loop increment accounts for the fourth
                        x += 3;
                        continue;
                }
                // slow path: one pixel at a time, skipping masked-out pixels
                if(!pixelmask[x])
                        continue;
                color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
                mod = _mm_packs_epi32(mod, mod);
                pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
                *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
        }
        if(pixel == buffer_FragColorbgra8)
                DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
#endif
}
2519
2520
2521
2522 void DPSOFTRAST_VertexShader_Lightmap(void)
2523 {
2524         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2525         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
2526         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.draw.numvertices);
2527 }
2528
// Pixel shader for the lightmap mode. Per pixel and channel it computes
//   pix = (Color_Diffuse * lightmap + Color_Ambient) * color [+ Color_Glow * glow]
// in 16 bit fixed point and saturates the result to bytes; the glow term is only added when the
// GLOW permutation bit is set. The alpha lanes of Color_Diffuse and Color_Glow are cleared and the
// alpha lane of Color_Ambient is replaced by the Alpha uniform.
// Pixels are written straight into the color framebuffer unless alpha test is enabled or the blend
// mode is not opaque; in that case they go to a local buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_PixelShader_Lightmap(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
{
#ifdef SSE2_PRESENT
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + span->start * 4;
        int x, startx = span->startx, endx = span->endx;
        __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
        // color texture uses texcoord0, the lightmap uses texcoord4
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
        // results that still need alpha test or blending are collected in the local buffer
        if (dpsoftrast.user.alphatest || dpsoftrast.fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        // ambient * 256 as integers with elements 0 and 2 swapped to match the bgra8 byte order;
        // lane 3 is then cleared and replaced by Alpha * 255, and the result is packed to 16 bit lanes
        // (the same four values end up in both 64 bit halves)
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
        Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
        // diffuse * 256, same element swap, lane 3 cleared so it contributes nothing to alpha
        Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
        if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
        {
                DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
                Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
                Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                // low 64 bits = ambient, high 64 bits = glow color; used by the single pixel path below
                Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
                for (x = startx;x < endx;x++)
                {
                        __m128i color, lightmap, glow, pix;
                        // fast path: at least four pixels remain and all four mask bytes are 1
                        if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
                        {
                                __m128i pix2;
                                color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                                lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
                                glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                // unpacking with zero in the low byte yields texel * 256, so every high
                                // multiply drops 16 bits; pix covers pixels x, x+1 and pix2 covers x+2, x+3
                                pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
                                                                                                        _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
                                                                        _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
                                pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
                                                                                                        _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
                                                                        _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                // three extra pixels consumed here, the loop increment accounts for the fourth
                                x += 3;
                                continue;
                        }
                        // slow path: one pixel at a time, skipping masked-out pixels
                        if(!pixelmask[x])
                                continue;
                        color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                        lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
                        glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                        // the lightmap's high half is zero, so after the add the low half holds
                        // diffuse*lightmap+ambient and the high half holds the glow color; one multiply
                        // then produces the lit color (low) and the glow term (high) together
                        pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                        // fold the glow term from the high half onto the lit color in the low half
                        pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
                        *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
                }
        }
        else
        {
                // same as above without the glow term
                for (x = startx;x < endx;x++)
                {
                        __m128i color, lightmap, pix;
                        if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
                        {
                                __m128i pix2;
                                color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                                lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
                                pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
                                                                          _mm_unpacklo_epi8(_mm_setzero_si128(), color));
                                pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
                                                                           _mm_unpackhi_epi8(_mm_setzero_si128(), color));
                                _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                x += 3;
                                continue;
                        }
                        if(!pixelmask[x])
                                continue;
                        color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                        lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
                        pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
                        *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
                }
        }
        if(pixel == buffer_FragColorbgra8)
                DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
#endif
}
2619
2620
2621
2622 void DPSOFTRAST_VertexShader_FakeLight(void)
2623 {
2624         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2625 }
2626
2627 void DPSOFTRAST_PixelShader_FakeLight(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
2628 {
2629         // TODO: IMPLEMENT
2630         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2631         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2632         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
2633         memset(buffer_FragColorbgra8, 0, span->length*4);
2634         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
2635 }
2636
2637
2638
// Vertex stage for the model-space light direction map mode; it currently just runs the
// lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
        DPSOFTRAST_VertexShader_Lightmap();
}
2643
// Pixel stage for the model-space light direction map mode; not implemented yet, so the span
// is shaded by the plain lightmap pixel shader instead.
void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
{
        DPSOFTRAST_PixelShader_Lightmap(span);
        // TODO: IMPLEMENT
}
2649
2650
2651
// Vertex stage for the tangent-space light direction map mode; it currently just runs the
// lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
        DPSOFTRAST_VertexShader_Lightmap();
}
2656
// Pixel stage for the tangent-space light direction map mode; not implemented yet, so the span
// is shaded by the plain lightmap pixel shader instead.
void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
{
        DPSOFTRAST_PixelShader_Lightmap(span);
        // TODO: IMPLEMENT
}
2662
2663
2664
2665 void DPSOFTRAST_VertexShader_LightDirection(void)
2666 {
2667         int i;
2668         int numvertices = dpsoftrast.draw.numvertices;
2669         float LightDir[4];
2670         float LightVector[4];
2671         float EyePosition[4];
2672         float EyeVectorModelSpace[4];
2673         float EyeVector[4];
2674         float position[4];
2675         float svector[4];
2676         float tvector[4];
2677         float normal[4];
2678         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
2679         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
2680         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
2681         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
2682         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
2683         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
2684         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
2685         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
2686         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2687         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
2688         for (i = 0;i < numvertices;i++)
2689         {
2690                 position[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
2691                 position[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
2692                 position[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
2693                 svector[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
2694                 svector[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
2695                 svector[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
2696                 tvector[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
2697                 tvector[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
2698                 tvector[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
2699                 normal[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
2700                 normal[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
2701                 normal[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
2702                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
2703                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
2704                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
2705                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
2706                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
2707                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
2708                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
2709                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
2710                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
2711                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
2712                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
2713                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
2714                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
2715                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
2716                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
2717                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
2718                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
2719         }
2720 }
2721
// Small scalar and 3-component vector helpers.
// All of them are macros and evaluate their arguments more than once, so do not pass expressions
// with side effects.
//   DPSOFTRAST_Min / DPSOFTRAST_Max      smaller / larger of two values
//   DPSOFTRAST_Vector3Dot                dot product of the first three elements of a and b
//   DPSOFTRAST_Vector3LengthSquared      dot product of v with itself
//   DPSOFTRAST_Vector3Length             square root of the above (needs <math.h>)
//   DPSOFTRAST_Vector3Normalize          scales v[0..2] in place to unit length; a zero-length
//                                        vector is left untouched
// The argument of DPSOFTRAST_Vector3Normalize is parenthesized at every use so that pointer
// expressions such as (array + 4) work as well as plain array names.
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
        float len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
        if (len)\
        {\
                len = 1.0f / len;\
                (v)[0] *= len;\
                (v)[1] *= len;\
                (v)[2] *= len;\
        }\
}\
while(0)
2740
2741 void DPSOFTRAST_PixelShader_LightDirection(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
2742 {
2743         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2744         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2745         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2746         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2747         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2748         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2749         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2750         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2751         int x, startx = span->startx, endx = span->endx;
2752         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
2753         float LightVectordata[4];
2754         float LightVectorslope[4];
2755         float EyeVectordata[4];
2756         float EyeVectorslope[4];
2757         float z;
2758         float diffusetex[4];
2759         float glosstex[4];
2760         float surfacenormal[4];
2761         float lightnormal[4];
2762         float eyenormal[4];
2763         float specularnormal[4];
2764         float diffuse;
2765         float specular;
2766         float SpecularPower;
2767         int d[4];
2768         Color_Glow[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
2769         Color_Glow[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
2770         Color_Glow[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
2771         Color_Glow[3] = 0.0f;
2772         Color_Ambient[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
2773         Color_Ambient[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
2774         Color_Ambient[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
2775         Color_Ambient[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
2776         Color_Pants[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
2777         Color_Pants[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
2778         Color_Pants[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
2779         Color_Pants[3] = 0.0f;
2780         Color_Shirt[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
2781         Color_Shirt[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
2782         Color_Shirt[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
2783         Color_Shirt[3] = 0.0f;
2784         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
2785         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2786         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2787         {
2788                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2789                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2790         }
2791         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
2792         {
2793                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2794         }
2795         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2796         {
2797                 Color_Diffuse[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
2798                 Color_Diffuse[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
2799                 Color_Diffuse[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
2800                 Color_Diffuse[3] = 0.0f;
2801                 LightColor[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
2802                 LightColor[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
2803                 LightColor[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
2804                 LightColor[3] = 0.0f;
2805                 LightVectordata[0]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][0];
2806                 LightVectordata[1]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][1];
2807                 LightVectordata[2]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][2];
2808                 LightVectordata[3]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][3];
2809                 LightVectorslope[0] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][0];
2810                 LightVectorslope[1] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][1];
2811                 LightVectorslope[2] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][2];
2812                 LightVectorslope[3] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][3];
2813                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2814                 Color_Specular[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
2815                 Color_Specular[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
2816                 Color_Specular[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
2817                 Color_Specular[3] = 0.0f;
2818                 SpecularPower = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
2819                 EyeVectordata[0]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][0];
2820                 EyeVectordata[1]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][1];
2821                 EyeVectordata[2]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][2];
2822                 EyeVectordata[3]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][3];
2823                 EyeVectorslope[0]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][0];
2824                 EyeVectorslope[1]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][1];
2825                 EyeVectorslope[2]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][2];
2826                 EyeVectorslope[3]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][3];
2827                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2828                 for (x = startx;x < endx;x++)
2829                 {
2830                         z = buffer_z[x];
2831                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
2832                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
2833                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
2834                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
2835                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2836                         {
2837                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
2838                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
2839                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
2840                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
2841                         }
2842                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
2843                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
2844                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
2845                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
2846                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
2847                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
2848                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
2849                         DPSOFTRAST_Vector3Normalize(surfacenormal);
2850
2851                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
2852                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
2853                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
2854                         DPSOFTRAST_Vector3Normalize(lightnormal);
2855
2856                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
2857                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
2858                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
2859                         DPSOFTRAST_Vector3Normalize(eyenormal);
2860
2861                         specularnormal[0] = lightnormal[0] + eyenormal[0];
2862                         specularnormal[1] = lightnormal[1] + eyenormal[1];
2863                         specularnormal[2] = lightnormal[2] + eyenormal[2];
2864                         DPSOFTRAST_Vector3Normalize(specularnormal);
2865
2866                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
2867                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
2868                         specular = pow(specular, SpecularPower * glosstex[3]);
2869                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
2870                         {
2871                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
2872                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
2873                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
2874                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
2875                         }
2876                         else
2877                         {
2878                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
2879                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
2880                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
2881                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
2882                         }
2883                         buffer_FragColorbgra8[x*4+0] = d[0];
2884                         buffer_FragColorbgra8[x*4+1] = d[1];
2885                         buffer_FragColorbgra8[x*4+2] = d[2];
2886                         buffer_FragColorbgra8[x*4+3] = d[3];
2887                 }
2888         }
2889         else if (dpsoftrast.shader_permutation & SHADERPERMUTATION_DIFFUSE)
2890         {
2891                 Color_Diffuse[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
2892                 Color_Diffuse[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
2893                 Color_Diffuse[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
2894                 Color_Diffuse[3] = 0.0f;
2895                 LightColor[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
2896                 LightColor[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
2897                 LightColor[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
2898                 LightColor[3] = 0.0f;
2899                 LightVectordata[0]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][0];
2900                 LightVectordata[1]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][1];
2901                 LightVectordata[2]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][2];
2902                 LightVectordata[3]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][3];
2903                 LightVectorslope[0] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][0];
2904                 LightVectorslope[1] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][1];
2905                 LightVectorslope[2] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][2];
2906                 LightVectorslope[3] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][3];
2907                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
2908                 for (x = startx;x < endx;x++)
2909                 {
2910                         z = buffer_z[x];
2911                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
2912                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
2913                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
2914                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
2915                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
2916                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
2917                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
2918                         DPSOFTRAST_Vector3Normalize(surfacenormal);
2919
2920                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
2921                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
2922                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
2923                         DPSOFTRAST_Vector3Normalize(lightnormal);
2924
2925                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
2926                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
2927                         {
2928                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
2929                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
2930                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
2931                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
2932                         }
2933                         else
2934                         {
2935                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
2936                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
2937                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
2938                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
2939                         }
2940                         buffer_FragColorbgra8[x*4+0] = d[0];
2941                         buffer_FragColorbgra8[x*4+1] = d[1];
2942                         buffer_FragColorbgra8[x*4+2] = d[2];
2943                         buffer_FragColorbgra8[x*4+3] = d[3];
2944                 }
2945         }
2946         else
2947         {
2948                 for (x = startx;x < endx;x++)
2949                 {
2950                         z = buffer_z[x];
2951                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
2952                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
2953                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
2954                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
2955
2956                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
2957                         {
2958                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
2959                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
2960                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
2961                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
2962                         }
2963                         else
2964                         {
2965                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
2966                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
2967                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
2968                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
2969                         }
2970                         buffer_FragColorbgra8[x*4+0] = d[0];
2971                         buffer_FragColorbgra8[x*4+1] = d[1];
2972                         buffer_FragColorbgra8[x*4+2] = d[2];
2973                         buffer_FragColorbgra8[x*4+3] = d[3];
2974                 }
2975         }
2976         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
2977 }
2978
2979
2980
2981 void DPSOFTRAST_VertexShader_LightSource(void)
2982 {
2983         int i;
2984         int numvertices = dpsoftrast.draw.numvertices;
2985         float LightPosition[4];
2986         float LightVector[4];
2987         float LightVectorModelSpace[4];
2988         float EyePosition[4];
2989         float EyeVectorModelSpace[4];
2990         float EyeVector[4];
2991         float position[4];
2992         float svector[4];
2993         float tvector[4];
2994         float normal[4];
2995         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
2996         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
2997         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
2998         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
2999         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3000         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3001         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3002         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3003         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3004         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3005         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3006         DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.draw.numvertices);
3007         for (i = 0;i < numvertices;i++)
3008         {
3009                 position[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3010                 position[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3011                 position[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3012                 svector[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3013                 svector[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3014                 svector[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3015                 tvector[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3016                 tvector[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3017                 tvector[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3018                 normal[0] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3019                 normal[1] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3020                 normal[2] = dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3021                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3022                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3023                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3024                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3025                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3026                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3027                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3028                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3029                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3030                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3031                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3032                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3033                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3034                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3035                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3036                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3037                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3038                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3039                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3040                 dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3041         }
3042 }
3043
3044 void DPSOFTRAST_PixelShader_LightSource(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
3045 {
3046         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3047         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3048         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3049         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3050         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3051         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3052         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3053         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3054         int x, startx = span->startx, endx = span->endx;
3055         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3056         float CubeVectordata[4];
3057         float CubeVectorslope[4];
3058         float LightVectordata[4];
3059         float LightVectorslope[4];
3060         float EyeVectordata[4];
3061         float EyeVectorslope[4];
3062         float z;
3063         float diffusetex[4];
3064         float glosstex[4];
3065         float surfacenormal[4];
3066         float lightnormal[4];
3067         float eyenormal[4];
3068         float specularnormal[4];
3069         float diffuse;
3070         float specular;
3071         float SpecularPower;
3072         float CubeVector[4];
3073         float attenuation;
3074         int d[4];
3075         Color_Glow[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3076         Color_Glow[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3077         Color_Glow[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3078         Color_Glow[3] = 0.0f;
3079         Color_Ambient[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3080         Color_Ambient[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3081         Color_Ambient[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3082         Color_Ambient[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3083         Color_Diffuse[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3084         Color_Diffuse[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3085         Color_Diffuse[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3086         Color_Diffuse[3] = 0.0f;
3087         Color_Specular[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3088         Color_Specular[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3089         Color_Specular[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3090         Color_Specular[3] = 0.0f;
3091         Color_Pants[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3092         Color_Pants[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3093         Color_Pants[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3094         Color_Pants[3] = 0.0f;
3095         Color_Shirt[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3096         Color_Shirt[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3097         Color_Shirt[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3098         Color_Shirt[3] = 0.0f;
3099         LightColor[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3100         LightColor[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3101         LightColor[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3102         LightColor[3] = 0.0f;
3103         SpecularPower = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3104         EyeVectordata[0]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][0];
3105         EyeVectordata[1]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][1];
3106         EyeVectordata[2]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][2];
3107         EyeVectordata[3]    = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD2][3];
3108         EyeVectorslope[0]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][0];
3109         EyeVectorslope[1]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][1];
3110         EyeVectorslope[2]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][2];
3111         EyeVectorslope[3]   = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD2][3];
3112         LightVectordata[0]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][0];
3113         LightVectordata[1]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][1];
3114         LightVectordata[2]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][2];
3115         LightVectordata[3]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD1][3];
3116         LightVectorslope[0] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][0];
3117         LightVectorslope[1] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][1];
3118         LightVectorslope[2] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][2];
3119         LightVectorslope[3] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD1][3];
3120         CubeVectordata[0]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD3][0];
3121         CubeVectordata[1]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD3][1];
3122         CubeVectordata[2]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD3][2];
3123         CubeVectordata[3]  = span->data[0][DPSOFTRAST_ARRAY_TEXCOORD3][3];
3124         CubeVectorslope[0] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD3][0];
3125         CubeVectorslope[1] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD3][1];
3126         CubeVectorslope[2] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD3][2];
3127         CubeVectorslope[3] = span->data[1][DPSOFTRAST_ARRAY_TEXCOORD3][3];
3128         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
3129         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3130         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3131         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3132         {
3133                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3134                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3135         }
3136         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3137                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3138         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3139         {
3140                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3141                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3142                 for (x = startx;x < endx;x++)
3143                 {
3144                         z = buffer_z[x];
3145                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3146                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3147                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3148                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3149                         if (attenuation < 0.01f)
3150                                 continue;
3151                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3152                         {
3153                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3154                                 if (attenuation < 0.01f)
3155                                         continue;
3156                         }
3157
3158                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3159                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3160                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3161                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3162                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3163                         {
3164                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3165                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3166                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3167                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3168                         }
3169                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3170                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3171                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3172                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3173                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3174                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3175                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3176                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3177
3178                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3179                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3180                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3181                         DPSOFTRAST_Vector3Normalize(lightnormal);
3182
3183                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3184                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3185                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3186                         DPSOFTRAST_Vector3Normalize(eyenormal);
3187
3188                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3189                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3190                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3191                         DPSOFTRAST_Vector3Normalize(specularnormal);
3192
3193                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3194                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3195                         specular = pow(specular, SpecularPower * glosstex[3]);
3196                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3197                         {
3198                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3199                                 attenuation *= (1.0f / 255.0f);
3200                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3201                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3202                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3203                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3204                         }
3205                         else
3206                         {
3207                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3208                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3209                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3210                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3211                         }
3212                         buffer_FragColorbgra8[x*4+0] = d[0];
3213                         buffer_FragColorbgra8[x*4+1] = d[1];
3214                         buffer_FragColorbgra8[x*4+2] = d[2];
3215                         buffer_FragColorbgra8[x*4+3] = d[3];
3216                 }
3217         }
3218         else if (dpsoftrast.shader_permutation & SHADERPERMUTATION_DIFFUSE)
3219         {
3220                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3221                 for (x = startx;x < endx;x++)
3222                 {
3223                         z = buffer_z[x];
3224                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3225                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3226                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3227                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3228                         if (attenuation < 0.01f)
3229                                 continue;
3230                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3231                         {
3232                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3233                                 if (attenuation < 0.01f)
3234                                         continue;
3235                         }
3236
3237                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3238                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3239                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3240                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3241                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3242                         {
3243                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3244                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3245                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3246                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3247                         }
3248                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3249                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3250                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3251                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3252
3253                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3254                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3255                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3256                         DPSOFTRAST_Vector3Normalize(lightnormal);
3257
3258                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3259                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3260                         {
3261                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3262                                 attenuation *= (1.0f / 255.0f);
3263                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3264                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3265                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3266                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3267                         }
3268                         else
3269                         {
3270                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3271                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3272                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3273                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3274                         }
3275                         buffer_FragColorbgra8[x*4+0] = d[0];
3276                         buffer_FragColorbgra8[x*4+1] = d[1];
3277                         buffer_FragColorbgra8[x*4+2] = d[2];
3278                         buffer_FragColorbgra8[x*4+3] = d[3];
3279                 }
3280         }
3281         else
3282         {
3283                 for (x = startx;x < endx;x++)
3284                 {
3285                         z = buffer_z[x];
3286                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3287                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3288                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3289                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3290                         if (attenuation < 0.01f)
3291                                 continue;
3292                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3293                         {
3294                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3295                                 if (attenuation < 0.01f)
3296                                         continue;
3297                         }
3298
3299                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3300                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3301                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3302                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3303                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3304                         {
3305                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3306                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3307                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3308                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3309                         }
3310                         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3311                         {
3312                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3313                                 attenuation *= (1.0f / 255.0f);
3314                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3315                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3316                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3317                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
3318                         }
3319                         else
3320                         {
3321                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3322                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3323                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3324                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3325                         }
3326                         buffer_FragColorbgra8[x*4+0] = d[0];
3327                         buffer_FragColorbgra8[x*4+1] = d[1];
3328                         buffer_FragColorbgra8[x*4+2] = d[2];
3329                         buffer_FragColorbgra8[x*4+3] = d[3];
3330                 }
3331         }
3332         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
3333 }
3334
3335
3336
3337 void DPSOFTRAST_VertexShader_Refraction(void)
3338 {
3339         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3340 }
3341
3342 void DPSOFTRAST_PixelShader_Refraction(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
3343 {
3344         // TODO: IMPLEMENT
3345         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3346         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3347         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
3348         memset(buffer_FragColorbgra8, 0, span->length*4);
3349         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
3350 }
3351
3352
3353
3354 void DPSOFTRAST_VertexShader_Water(void)
3355 {
3356         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3357 }
3358
3359
3360 void DPSOFTRAST_PixelShader_Water(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
3361 {
3362         // TODO: IMPLEMENT
3363         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3364         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
3366         memset(buffer_FragColorbgra8, 0, span->length*4);
3367         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
3368 }
3369
3370
3371
3372 void DPSOFTRAST_VertexShader_ShowDepth(void)
3373 {
3374         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3375 }
3376
3377 void DPSOFTRAST_PixelShader_ShowDepth(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
3378 {
3379         // TODO: IMPLEMENT
3380         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3381         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
3383         memset(buffer_FragColorbgra8, 0, span->length*4);
3384         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
3385 }
3386
3387
3388
3389 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
3390 {
3391         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3392 }
3393
3394 void DPSOFTRAST_PixelShader_DeferredGeometry(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
3395 {
3396         // TODO: IMPLEMENT
3397         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3398         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3399         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
3400         memset(buffer_FragColorbgra8, 0, span->length*4);
3401         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
3402 }
3403
3404
3405
3406 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
3407 {
3408         DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3409 }
3410
3411 void DPSOFTRAST_PixelShader_DeferredLightSource(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
3412 {
3413         // TODO: IMPLEMENT
3414         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3415         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3416         DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
3417         memset(buffer_FragColorbgra8, 0, span->length*4);
3418         DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
3419 }
3420
3421
3422
3423 typedef struct DPSOFTRAST_ShaderModeInfo_s
3424 {
3425         int lodarrayindex;
3426         void (*Vertex)(void);
3427         void (*Span)(const DPSOFTRAST_State_Draw_Span * RESTRICT span);
3428 }
3429 DPSOFTRAST_ShaderModeInfo;
3430
3431 DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
3432 {
3433         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                      },
3434         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                  },
3435         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,              },
3436         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                    },
3437         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                  },
3438         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                     },
3439         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                    },
3440         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, },
3441         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace},
3442         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,               },
3443         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                  },
3444         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                   },
3445         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                        },
3446         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                    },
3447         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,             },
3448         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,          }
3449 };
3450
3451
3452
3453 void DPSOFTRAST_Draw_ProcessSpans(void)
3454 {
3455         int i;
3456         int x;
3457         int startx;
3458         int endx;
3459         int numspans = dpsoftrast.draw.numspans;
3460 //      unsigned int c;
3461 //      unsigned int *colorpixel;
3462         unsigned int *depthpixel;
3463         float w;
3464         float wslope;
3465         int depth;
3466         int depthslope;
3467         unsigned int d;
3468         DPSOFTRAST_State_Draw_Span *span = dpsoftrast.draw.spanqueue;
3469         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3470         for (i = 0;i < numspans;i++, span++)
3471         {
3472                 w = span->data[0][DPSOFTRAST_ARRAY_TOTAL][3];
3473                 wslope = span->data[1][DPSOFTRAST_ARRAY_TOTAL][3];
3474                 if (dpsoftrast.user.depthtest && dpsoftrast.fb_depthpixels)
3475                 {
3476                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
3477                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(dpsoftrast.user.polygonoffset[1] + fabs(wslope)*dpsoftrast.user.polygonoffset[0]));
3478                         depthpixel = dpsoftrast.fb_depthpixels + span->start;
3479                         switch(dpsoftrast.fb_depthfunc)
3480                         {
3481                         default:
3482                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
3483                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
3484                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
3485                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
3486                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
3487                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
3488                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
3489                         }
3490                         //colorpixel = dpsoftrast.fb_colorpixels[0] + span->start;
3491                         //for (x = 0;x < span->length;x++)
3492                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
3493                         // if there is no color buffer, skip pixel shader
3494                         startx = 0;
3495                         endx = span->length;
3496                         while (startx < endx && !pixelmask[startx])
3497                                 startx++;
3498                         while (endx > startx && !pixelmask[endx-1])
3499                                 endx--;
3500                         if (startx >= endx)
3501                                 continue; // no pixels to fill
3502                         span->pixelmask = pixelmask;
3503                         span->startx = startx;
3504                         span->endx = endx;
3505                         // run pixel shader if appropriate
3506                         // do this before running depthmask code, to allow the pixelshader
3507                         // to clear pixelmask values for alpha testing
3508                         if (dpsoftrast.fb_colorpixels[0] && dpsoftrast.fb_colormask)
3509                                 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Span(span);
3510                         if (dpsoftrast.user.depthmask)
3511                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
3512                                         if (pixelmask[x])
3513                                                 depthpixel[x] = d;
3514                 }
3515                 else
3516                 {
3517                         // no depth testing means we're just dealing with color...
3518                         // if there is no color buffer, skip pixel shader
3519                         if (dpsoftrast.fb_colorpixels[0] && dpsoftrast.fb_colormask)
3520                         {
3521                                 memset(pixelmask, 1, span->length);
3522                                 span->pixelmask = pixelmask;
3523                                 span->startx = 0;
3524                                 span->endx = span->length;
3525                                 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Span(span);
3526                         }
3527                 }
3528         }
3529 }
3530
3531 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask)
3532 {
3533 #ifdef SSE2_PRESENT
3534         int cullface = dpsoftrast.user.cullface;
3535         int width = dpsoftrast.fb_width;
3536         int height = dpsoftrast.fb_height;
3537         __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
3538         int i;
3539         int j;
3540         int k;
3541         int y;
3542         int e[3];
3543         __m128i screeny;
3544         int starty, endy;
3545         int numpoints;
3546         int edge0p;
3547         int edge0n;
3548         int edge1p;
3549         int edge1n;
3550         int startx;
3551         int endx;
3552         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS];
3553         __m128 mipedgescale;
3554         float clipdist[4];
3555         __m128 clipfrac[4];
3556         __m128 clipped[DPSOFTRAST_ARRAY_TOTAL][4];
3557         __m128 screen[4];
3558         __m128 proj[DPSOFTRAST_ARRAY_TOTAL][4];
3559         DPSOFTRAST_Texture *texture;
3560         DPSOFTRAST_State_Draw_Span *span;
3561         DPSOFTRAST_State_Draw_Span *oldspan;
3562         for (i = 0;i < numtriangles;i++)
3563         {
3564                 // generate the 3 edges of this triangle
3565                 // generate spans for the triangle - switch based on left split or right split classification of triangle
3566                 if (element3i)
3567                 {
3568                         e[0] = element3i[i*3+0] - firstvertex;
3569                         e[1] = element3i[i*3+1] - firstvertex;
3570                         e[2] = element3i[i*3+2] - firstvertex;
3571                 }
3572                 else if (element3s)
3573                 {
3574                         e[0] = element3s[i*3+0] - firstvertex;
3575                         e[1] = element3s[i*3+1] - firstvertex;
3576                         e[2] = element3s[i*3+2] - firstvertex;
3577                 }
3578                 else
3579                 {
3580                         e[0] = i*3+0;
3581                         e[1] = i*3+1;
3582                         e[2] = i*3+2;
3583                 }
3584
3585 #define SKIPBACKFACE \
3586                 if(cullface != GL_NONE) \
3587                 { \
3588                         __m128 triangleedge[2] = { _mm_sub_ps(screen[0], screen[1]), _mm_sub_ps(screen[2], screen[1]) }; \
3589                         /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
3590                         __m128 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))), \
3591                                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1])); \
3592                         /* apply current cullface mode (this culls many triangles) */ \
3593                         switch(cullface) \
3594                         { \
3595                         case GL_BACK: \
3596                                 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
3597                                         continue; \
3598                                 break; \
3599                         case GL_FRONT: \
3600                                 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
3601                                         continue; \
3602                                 break; \
3603                         } \
3604                 }
3605                         //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
3606                         //                                                _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
3607                         //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
3608                         //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
3609                         //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
3610
3611                         // macros for clipping vertices
3612 #define CLIPPEDVERTEXLERP(k,p1, p2) \
3613                         clipfrac[k] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
3614                         { \
3615                                 __m128 v1 = _mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
3616                                 clipped[DPSOFTRAST_ARRAY_POSITION][k] = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[k])); \
3617                         } \
3618                         screen[k] = DPSOFTRAST_Draw_ProjectVertex(clipped[DPSOFTRAST_ARRAY_POSITION][k]);
3619 #define CLIPPEDATTRIBSLERP(k,p1,p2) \
3620                         for (j = DPSOFTRAST_ARRAY_POSITION+1;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
3621                         {\
3622                                 /*if (arraymask[j])*/\
3623                                 {\
3624                                         __m128 v1 = _mm_load_ps(&dpsoftrast.draw.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.draw.post_array4f[j][e[p2]*4]); \
3625                                         clipped[j][k] = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[k])); \
3626                                 }\
3627                         }
3628 #define CLIPPEDVERTEXCOPY(k,p1) \
3629                         screen[k] = _mm_load_ps(&dpsoftrast.draw.screencoord4f[e[p1]*4]);
3630 #define CLIPPEDATTRIBSCOPY(k,p1) \
3631                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
3632                         {\
3633                                 /*if (arraymask[j])*/\
3634                                 {\
3635                                         clipped[j][k] = _mm_load_ps(&dpsoftrast.draw.post_array4f[j][e[p1]*4]); \
3636                                 }\
3637                         }
3638
3639                 // calculate distance from nearplane
3640                 clipdist[0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
3641                 clipdist[1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
3642                 clipdist[2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
3643                 if (clipdist[0] >= 0.0f)
3644                 {
3645                         if (clipdist[1] >= 0.0f)
3646                         {
3647                                 if (clipdist[2] >= 0.0f)
3648                                 {
3649                                         // triangle is entirely in front of nearplane
3650                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
3651                                         numpoints = 3;
3652                                         SKIPBACKFACE;
3653                                         CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSCOPY(2,2);
3654                                 }
3655                                 else
3656                                 {
3657                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
3658                                         numpoints = 4;
3659                                         SKIPBACKFACE;
3660                                         CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSLERP(2,1,2); CLIPPEDATTRIBSLERP(3,2,0);
3661                                 }
3662                         }
3663                         else 
3664                         {
3665                                 if (clipdist[2] >= 0.0f)
3666                                 {
3667                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2);     CLIPPEDVERTEXCOPY(3,2);
3668                                         numpoints = 4;
3669                                         SKIPBACKFACE;
3670                                         CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSLERP(1,0,1); CLIPPEDATTRIBSLERP(2,1,2); CLIPPEDATTRIBSCOPY(3,2);
3671                                 }
3672                                 else
3673                                 {
3674                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
3675                                         numpoints = 3;
3676                                         SKIPBACKFACE;
3677                                         CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSLERP(1,0,1); CLIPPEDATTRIBSLERP(2,2,0);
3678                                 }
3679                         }
3680                 }                       
3681                 else if (clipdist[1] >= 0.0f)
3682                 {
3683                         if (clipdist[2] >= 0.0f)
3684                         {
3685                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
3686                                 numpoints = 4;
3687                                 SKIPBACKFACE;
3688                                 CLIPPEDATTRIBSLERP(0,0,1); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSCOPY(2,2); CLIPPEDATTRIBSLERP(3,2,0);
3689                         }
3690                         else
3691                         {
3692                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
3693                                 numpoints = 3;
3694                                 SKIPBACKFACE;
3695                                 CLIPPEDATTRIBSLERP(0,0,1); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSLERP(2,1,2);
3696                         }
3697                 }
3698                 else if (clipdist[2] >= 0.0f)
3699                 {
3700                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
3701                         numpoints = 3;
3702                         SKIPBACKFACE;
3703                         CLIPPEDATTRIBSLERP(0,1,2); CLIPPEDATTRIBSCOPY(1,2); CLIPPEDATTRIBSLERP(2,2,0);
3704                 }
3705                 else continue; // triangle is entirely behind nearplane
3706
3707                 {
3708                         // calculate integer y coords for triangle points
3709                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_shuffle_ps(screen[0], screen[1], _MM_SHUFFLE(1, 0, 1, 0))),
3710                                                                                   _mm_cvttps_epi32(_mm_shuffle_ps(screen[2], numpoints <= 3 ? screen[2] : screen[3], _MM_SHUFFLE(1, 0, 1, 0)))),
3711                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)), 
3712                                         screenmin = _mm_min_epi16(screeni, screenir), 
3713                                         screenmax = _mm_max_epi16(screeni, screenir);
3714                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
3715                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
3716                         screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
3717                         screenmax = _mm_min_epi16(screenmax, fbmax);
3718                         // skip offscreen triangles
3719                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
3720                                 continue;
3721                         starty = _mm_extract_epi16(screenmin, 1);
3722                         endy = _mm_extract_epi16(screenmax, 1)+1;
3723                         screeny = _mm_srai_epi32(screeni, 16);
3724                 }
3725
3726                 // okay, this triangle is going to produce spans, we'd better project
3727                 // the interpolants now (this is what gives perspective texturing),
3728                 // this consists of simply multiplying all arrays by the W coord
3729                 // (which is basically 1/Z), which will be undone per-pixel
3730                 // (multiplying by Z again) to get the perspective-correct array
3731                 // values
3732                 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
3733                 {
3734                         //if (arraymask[j])
3735                         {
3736                                 for (k = 0;k < numpoints;k++)
3737                                 {
3738                                         proj[j][k] = _mm_mul_ps(clipped[j][k], _mm_shuffle_ps(screen[k], screen[k], _MM_SHUFFLE(3, 3, 3, 3)));
3739                                 }
3740                         }
3741                 }
3742                 // adjust texture LOD by texture density, in the simplest way possible...
3743                 mipedgescale = _mm_sub_ps(_mm_shuffle_ps(screen[0], screen[2], _MM_SHUFFLE(1, 0, 1, 0)), _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 0, 1, 0)));
3744                 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
3745                 mipedgescale = _mm_div_ps(_mm_set1_ps(1.0f), _mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
3746                 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
3747                 {
3748                         texture = dpsoftrast.texbound[j];
3749                         if (texture)
3750                         {
3751                                 __m128 mipedgetc;
3752                                 if (texture->filter <= DPSOFTRAST_TEXTURE_FILTER_LINEAR)
3753                                 {
3754                                         mip[j] = 0;
3755                                         continue;
3756                                 }
3757                                 k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
3758                                 mipedgetc = _mm_sub_ps(_mm_shuffle_ps(clipped[k][0], clipped[k][2], _MM_SHUFFLE(1, 0, 1, 0)),
3759                                                                                 _mm_shuffle_ps(clipped[k][1], clipped[k][1], _MM_SHUFFLE(1, 0, 1, 0)));
3760                                 mipedgetc = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
3761                                 mipedgetc = _mm_mul_ps(mipedgetc, mipedgetc);
3762                                 mipedgetc = _mm_add_ps(mipedgetc, _mm_shuffle_ps(mipedgetc, mipedgetc, _MM_SHUFFLE(2, 3, 0, 1)));
3763                                 mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
3764                                 mipedgetc = _mm_min_ss(mipedgetc, _mm_shuffle_ps(mipedgetc, mipedgetc, _MM_SHUFFLE(2, 2, 2, 2)));       
3765                                 // this will be multiplied in the texturing routine by the texture resolution
3766                                 y = _mm_cvtss_si32(mipedgetc);
3767                                 if (y > 0) 
3768                                 {
3769                                         y = (int)(log((float)y)/M_LN2);
3770                                         if (y > texture->mipmaps - 1)
3771                                                 y = texture->mipmaps - 1;
3772                                 }
3773                                 else y = 0;
3774                                 mip[j] = y;
3775                         }
3776                 }
3777                 // iterate potential spans
3778                 // TODO: optimize?  if we figured out the edge order beforehand, this
3779                 //         could do loops over the edges in the proper order rather than
3780                 //         selecting them for each span
3781                 // TODO: optimize?  the edges could have data slopes calculated
3782                 // TODO: optimize?  the data slopes could be calculated as a plane
3783                 //         (2D slopes) to avoid any interpolation along edges at all
3784                 for (y = starty+1;y < endy;)
3785                 {
3786                         int nexty = -1;
3787                         __m128 edge0offset, edge1offset, edge0scale, edge1scale, data[DPSOFTRAST_ARRAY_TOTAL+1][2], slope[DPSOFTRAST_ARRAY_TOTAL+1][2];
3788                         __m128i screenycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
3789                         int screenymask = _mm_movemask_epi8(screenycc);
3790                         if (numpoints == 4)
3791                         {
3792                                 switch(screenymask)
3793                                 {
3794                                 default:
3795                                 case 0xFFFF: /*0000*/ y++; continue;
3796                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
3797                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
3798                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
3799                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
3800                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
3801                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
3802                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
3803                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
3804                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
3805                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
3806                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
3807                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
3808                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
3809                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
3810                                 case 0x0000: /*1111*/ y++; continue;
3811                                 }
3812                         }
3813                         else
3814                         {
3815                                 switch(screenymask)
3816                                 {
3817                                 default:
3818                                 case 0xFFFF: /*000*/ y++; continue;
3819                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
3820                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
3821                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
3822                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
3823                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
3824                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
3825                                 case 0x0000: /*111*/ y++; continue;
3826                                 }
3827                         }
3828                         screenycc = _mm_max_epi16(_mm_srli_epi16(screenycc, 1), screeny);
3829                         screenycc = _mm_min_epi16(screenycc, _mm_shuffle_epi32(screenycc, _MM_SHUFFLE(1, 0, 3, 2)));  
3830                         screenycc = _mm_min_epi16(screenycc, _mm_shuffle_epi32(screenycc, _MM_SHUFFLE(2, 3, 0, 1)));
3831                         nexty = _mm_extract_epi16(screenycc, 0);        
3832                         if(nexty >= endy) nexty = endy-1;
3833                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
3834                         {
3835                                 int tmp = edge0n;
3836                                 edge0n = edge1n;
3837                                 edge1n = tmp;
3838                                 tmp = edge0p;
3839                                 edge0p = edge1p;
3840                                 edge1p = tmp;
3841                         }       
3842                         edge0offset = _mm_shuffle_ps(screen[edge0p], screen[edge0p], _MM_SHUFFLE(1, 1, 1, 1));
3843                         edge0scale = _mm_div_ss(_mm_set1_ps(1.0f), _mm_sub_ss(_mm_shuffle_ps(screen[edge0n], screen[edge0n], _MM_SHUFFLE(1, 1, 1, 1)), edge0offset));
3844                         edge0scale = _mm_shuffle_ps(edge0scale, edge0scale, _MM_SHUFFLE(0, 0, 0, 0));
3845                         edge0offset = _mm_sub_ps(_mm_set1_ps(y), edge0offset);
3846                         edge1offset = _mm_shuffle_ps(screen[edge1p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1));
3847                         edge1scale = _mm_div_ss(_mm_set1_ps(1.0f), _mm_sub_ss(_mm_shuffle_ps(screen[edge1n], screen[edge1n], _MM_SHUFFLE(1, 1, 1, 1)), edge1offset));
3848                         edge1offset = _mm_sub_ps(_mm_set1_ps(y), edge1offset);
3849                         edge1scale = _mm_shuffle_ps(edge1scale, edge1scale, _MM_SHUFFLE(0, 0, 0, 0));
3850                         j = DPSOFTRAST_ARRAY_TOTAL;
3851                         slope[j][0] = _mm_mul_ps(_mm_sub_ps(screen[edge0n], screen[edge0p]), edge0scale);
3852                         slope[j][1] = _mm_mul_ps(_mm_sub_ps(screen[edge1n], screen[edge1p]), edge1scale);
3853                         data[j][0] = _mm_add_ps(_mm_mul_ps(slope[j][0], edge0offset), screen[edge0p]);
3854                         data[j][1] = _mm_add_ps(_mm_mul_ps(slope[j][1], edge1offset), screen[edge1p]);
3855                         data[j][1] = _mm_sub_ps(data[j][1], data[j][0]);
3856                         slope[j][1] = _mm_sub_ps(slope[j][1], slope[j][0]);
3857                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
3858                         {
3859                                 //if (arraymask[j])
3860                                 {
3861                                         slope[j][0] = _mm_mul_ps(_mm_sub_ps(proj[j][edge0n], proj[j][edge0p]), edge0scale);
3862                                         slope[j][1] = _mm_mul_ps(_mm_sub_ps(proj[j][edge1n], proj[j][edge1p]), edge1scale);
3863                                         data[j][0] = _mm_add_ps(_mm_mul_ps(slope[j][0], edge0offset), proj[j][edge0p]);
3864                                         data[j][1] = _mm_add_ps(_mm_mul_ps(slope[j][1], edge1offset), proj[j][edge1p]);
3865                                         data[j][1] = _mm_sub_ps(data[j][1], data[j][0]);
3866                                         slope[j][1] = _mm_sub_ps(slope[j][1], slope[j][0]);
3867                                 }
3868                         }
3869                         goto firstspan;
3870                         for(; y <= nexty; y++)
3871                         {
3872                                 __m128 data0, data1, spanilength, startxlerp;
3873                                 j = DPSOFTRAST_ARRAY_TOTAL;
3874                                 data[j][0] = _mm_add_ps(data[j][0], slope[j][0]);
3875                                 data[j][1] = _mm_add_ps(data[j][1], slope[j][1]);
3876                                 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
3877                                 {
3878                                         //if (arraymask[j])
3879                                         {
3880                                                 data[j][0] = _mm_add_ps(data[j][0], slope[j][0]);
3881                                                 data[j][1] = _mm_add_ps(data[j][1], slope[j][1]);
3882                                         }
3883                                 }
3884
3885                         firstspan:
3886                                 startx = _mm_cvtss_si32(_mm_add_ss(data[DPSOFTRAST_ARRAY_TOTAL][0], _mm_set1_ps(0.5f)));
3887                                 endx = _mm_cvtss_si32(_mm_add_ss(_mm_add_ss(data[DPSOFTRAST_ARRAY_TOTAL][0], data[DPSOFTRAST_ARRAY_TOTAL][1]), _mm_set1_ps(0.5f)));
3888                                 if (startx < 0) startx = 0;
3889                                 if (endx > width) endx = width;
3890                                 if (startx >= endx) continue;
3891 #if 0
3892                                 _mm_store_ss(&startxf, data0);
3893                                 _mm_store_ss(&endxf, data1);
3894                                 if (startxf > startx || endxf < endx-1) { printf("%s:%i X wrong (%i to %i is outside %f to %f)\n", __FILE__, __LINE__, startx, endx, startxf, endxf); }
3895 #endif
3896                                 spanilength = _mm_div_ss(_mm_set1_ps(1.0f), data[DPSOFTRAST_ARRAY_TOTAL][1]);
3897                                 spanilength = _mm_shuffle_ps(spanilength, spanilength, _MM_SHUFFLE(0, 0, 0, 0));
3898                                 startxlerp = _mm_sub_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), startx), data[DPSOFTRAST_ARRAY_TOTAL][0]);
3899                                 startxlerp = _mm_shuffle_ps(startxlerp, startxlerp, _MM_SHUFFLE(0, 0, 0, 0));
3900                                 span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
3901                                 memcpy(span->mip, mip, sizeof(span->mip));
3902                                 span->start = y * width + startx;
3903                                 span->length = endx - startx;
3904                                 j = DPSOFTRAST_ARRAY_TOTAL;
3905                                 data1 = _mm_mul_ps(data[j][1], spanilength);
3906                                 data0 = _mm_add_ps(data[j][0], _mm_mul_ps(data1, startxlerp));
3907                                 _mm_store_ps(span->data[0][j], data0);
3908                                 _mm_store_ps(span->data[1][j], data1);
3909                                 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
3910                                 {
3911                                         //if (arraymask[j])
3912                                         {
3913                                                 data1 = _mm_mul_ps(data[j][1], spanilength);
3914                                                 data0 = _mm_add_ps(data[j][0], _mm_mul_ps(data1, startxlerp));
3915                                                 _mm_store_ps(span->data[0][j], data0);
3916                                                 _mm_store_ps(span->data[1][j], data1);
3917                                         }
3918                                 }
3919                                 // to keep the shader routines from needing more than a small
3920                                 // buffer for pixel intermediate data, we split long spans...
3921                                 while (span->length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
3922                                 {
3923                                         span->length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
3924                                         if (dpsoftrast.draw.numspans >= DPSOFTRAST_DRAW_MAXSPANQUEUE)
3925                                         {
3926                                                 DPSOFTRAST_Draw_ProcessSpans();
3927                                                 dpsoftrast.draw.numspans = 0;
3928                                         }
3929                                         oldspan = span;
3930                                         span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
3931                                         *span = *oldspan;
3932                                         startx += DPSOFTRAST_DRAW_MAXSPANLENGTH;
3933                                         span->start = y * width + startx;
3934                                         span->length = endx - startx;
3935                                         j = DPSOFTRAST_ARRAY_TOTAL;
3936                                         _mm_store_ps(span->data[0][j], _mm_add_ps(_mm_load_ps(span->data[0][j]), _mm_mul_ps(_mm_load_ps(span->data[1][j]), _mm_set1_ps(DPSOFTRAST_DRAW_MAXSPANLENGTH))));
3937                                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
3938                                         {
3939                                                 //if (arraymask[j])
3940                                                 {
3941                                                          _mm_store_ps(span->data[0][j], _mm_add_ps(_mm_load_ps(span->data[0][j]), _mm_mul_ps(_mm_load_ps(span->data[1][j]), _mm_set1_ps(DPSOFTRAST_DRAW_MAXSPANLENGTH))));
3942                                                 }
3943                                         }
3944                                 }
3945                                 // after all that, we have a span suitable for the pixel shader...
3946                                 if (dpsoftrast.draw.numspans >= DPSOFTRAST_DRAW_MAXSPANQUEUE)
3947                                 {
3948                                         DPSOFTRAST_Draw_ProcessSpans();
3949                                         dpsoftrast.draw.numspans = 0;
3950                                 }
3951                         }
3952                 }
3953                 // draw outlines over triangle for debugging
3954         //      for (j = 0, k = numpoints-1;j < numpoints;k = j, j++)
3955         //              DPSOFTRAST_Draw_DebugEdgePoints(screen[k], screen[j]);
3956         }
3957         if (dpsoftrast.draw.numspans)
3958         {
3959                 DPSOFTRAST_Draw_ProcessSpans();
3960                 dpsoftrast.draw.numspans = 0;
3961         }
3962 #endif
3963 }
3964
3965 void DPSOFTRAST_Draw_DebugPoints(void)
3966 {
3967         int i;
3968         int x;
3969         int y;
3970         int numvertices = dpsoftrast.draw.numvertices;
3971         int w = dpsoftrast.fb_width;
3972         int bounds[4];
3973         unsigned int *pixels = dpsoftrast.fb_colorpixels[0];
3974         const float *c4f;
3975         bounds[0] = dpsoftrast.fb_viewportscissor[0];
3976         bounds[1] = dpsoftrast.fb_viewportscissor[1];
3977         bounds[2] = dpsoftrast.fb_viewportscissor[0] + dpsoftrast.fb_viewportscissor[2];
3978         bounds[3] = dpsoftrast.fb_viewportscissor[1] + dpsoftrast.fb_viewportscissor[3];
3979         for (i = 0;i < numvertices;i++)
3980         {
3981                 // check nearclip
3982                 //if (dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+3] != 1.0f)
3983                 //      continue;
3984                 x = (int)(dpsoftrast.draw.screencoord4f[i*4+0]);
3985                 y = (int)(dpsoftrast.draw.screencoord4f[i*4+1]);
3986                 //x = (int)(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0] + 0.5f);
3987                 //y = (int)(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1] + 0.5f);
3988                 //x = (int)((dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0] + 1.0f) * dpsoftrast.fb_width * 0.5f + 0.5f);
3989                 //y = (int)((dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1] + 1.0f) * dpsoftrast.fb_height * 0.5f + 0.5f);
3990                 if (x < bounds[0] || y < bounds[1] || x >= bounds[2] || y >= bounds[3])
3991                         continue;
3992                 c4f = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_COLOR] + i*4;
3993                 pixels[y*w+x] = DPSOFTRAST_BGRA8_FROM_RGBA32F(c4f[0], c4f[1], c4f[2], c4f[3]);
3994         }
3995 }
3996
3997 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
3998 {
3999         unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4000         arraymask[0] = true;
4001         arraymask[1] = dpsoftrast.fb_colorpixels[0] != NULL; // TODO: optimize (decide based on shadermode)
4002         arraymask[2] = dpsoftrast.pointer_texcoordf[0] != NULL;
4003         arraymask[3] = dpsoftrast.pointer_texcoordf[1] != NULL;
4004         arraymask[4] = dpsoftrast.pointer_texcoordf[2] != NULL;
4005         arraymask[5] = dpsoftrast.pointer_texcoordf[3] != NULL;
4006         arraymask[6] = dpsoftrast.pointer_texcoordf[4] != NULL;
4007         arraymask[7] = dpsoftrast.pointer_texcoordf[5] != NULL;
4008         arraymask[8] = dpsoftrast.pointer_texcoordf[6] != NULL;
4009         arraymask[9] = dpsoftrast.pointer_texcoordf[7] != NULL;
4010         DPSOFTRAST_Validate(DPSOFTRAST_VALIDATE_DRAW);
4011         DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, true);
4012         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4013         DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.draw.screencoord4f, dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
4014         DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask);
4015 }
4016
4017 void DPSOFTRAST_Init(int width, int height, unsigned int *colorpixels, unsigned int *depthpixels)
4018 {
4019         union
4020         {
4021                 int i;
4022                 unsigned char b[4];
4023         }
4024         u;
4025         u.i = 1;
4026         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4027         dpsoftrast.bigendian = u.b[3];
4028         dpsoftrast.fb_width = width;
4029         dpsoftrast.fb_height = height;
4030         dpsoftrast.fb_depthpixels = depthpixels;
4031         dpsoftrast.fb_colorpixels[0] = colorpixels;
4032         dpsoftrast.fb_colorpixels[1] = NULL;
4033         dpsoftrast.fb_colorpixels[1] = NULL;
4034         dpsoftrast.fb_colorpixels[1] = NULL;
4035         dpsoftrast.texture_firstfree = 1;
4036         dpsoftrast.texture_end = 1;
4037         dpsoftrast.texture_max = 0;
4038         dpsoftrast.user.colormask[0] = 1;
4039         dpsoftrast.user.colormask[1] = 1;
4040         dpsoftrast.user.colormask[2] = 1;
4041         dpsoftrast.user.colormask[3] = 1;
4042         dpsoftrast.user.blendfunc[0] = GL_ONE;
4043         dpsoftrast.user.blendfunc[1] = GL_ZERO;
4044         dpsoftrast.user.depthmask = true;
4045         dpsoftrast.user.depthtest = true;
4046         dpsoftrast.user.depthfunc = GL_LEQUAL;
4047         dpsoftrast.user.scissortest = false;
4048         dpsoftrast.user.cullface = GL_BACK;
4049         dpsoftrast.user.alphatest = false;
4050         dpsoftrast.user.alphafunc = GL_GREATER;
4051         dpsoftrast.user.alphavalue = 0.5f;
4052         dpsoftrast.user.scissor[0] = 0;
4053         dpsoftrast.user.scissor[1] = 0;
4054         dpsoftrast.user.scissor[2] = dpsoftrast.fb_width;
4055         dpsoftrast.user.scissor[3] = dpsoftrast.fb_height;
4056         dpsoftrast.user.viewport[0] = 0;
4057         dpsoftrast.user.viewport[1] = 0;
4058         dpsoftrast.user.viewport[2] = dpsoftrast.fb_width;
4059         dpsoftrast.user.viewport[3] = dpsoftrast.fb_height;
4060         dpsoftrast.user.depthrange[0] = 0;
4061         dpsoftrast.user.depthrange[1] = 1;
4062         dpsoftrast.user.polygonoffset[0] = 0;
4063         dpsoftrast.user.polygonoffset[1] = 0;
4064         dpsoftrast.user.color[0] = 1;
4065         dpsoftrast.user.color[1] = 1;
4066         dpsoftrast.user.color[2] = 1;
4067         dpsoftrast.user.color[3] = 1;
4068         dpsoftrast.validate = -1;
4069         DPSOFTRAST_Validate(-1);
4070         dpsoftrast.validate = 0;
4071 }
4072
4073 void DPSOFTRAST_Shutdown(void)
4074 {
4075         int i;
4076         for (i = 0;i < dpsoftrast.texture_end;i++)
4077                 if (dpsoftrast.texture[i].bytes)
4078                         MM_FREE(dpsoftrast.texture[i].bytes);
4079         if (dpsoftrast.texture)
4080                 free(dpsoftrast.texture);
4081         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4082 }
4083