3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
11 #include <SDL_thread.h>
// Local boolean alias onto the engine's qboolean type.
15 typedef qboolean bool;
// Alignment in bytes that MM_MALLOC/MM_CALLOC pass to _mm_malloc.
19 #define ATOMIC_SIZE 32
// __attribute__-based variant: ALIGN requests 16-byte alignment, ATOMIC requests
// 32-byte alignment, and MEMORY_BARRIER expands to the _mm_sfence intrinsic.
22 #define ALIGN(var) var __attribute__((__aligned__(16)))
23 #define ATOMIC(var) var __attribute__((__aligned__(32)))
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
26 #elif defined(_MSC_VER)
// MSVC variant of the same three macros, using __declspec(align(...)).
27 #define ALIGN(var) __declspec(align(16)) var
28 #define ATOMIC(var) __declspec(align(32)) var
29 #define MEMORY_BARRIER (_mm_sfence())
// Fallback variant: no alignment request and a no-op barrier.
32 #define ALIGN(var) var
33 #define ATOMIC(var) var
34 #define MEMORY_BARRIER ((void)0)
// NOTE(review): second no-op barrier definition; the preprocessor condition that
// selects it is not visible in this view - confirm against the full file.
39 #define MEMORY_BARRIER ((void)0)
43 #include <emmintrin.h>
// Allocates `size` bytes aligned to ATOMIC_SIZE through _mm_malloc; release with MM_FREE.
45 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
47 static void *MM_CALLOC(size_t nmemb, size_t size)
49 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
50 if(ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained from the _mm_malloc-based allocators.
54 #define MM_FREE _mm_free
// C library variants of the allocator macros (the preprocessor condition that
// selects them is not visible in this view).
56 #define MM_MALLOC(size) malloc(size)
57 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color, eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the array count and sizes tables such as
// DPSOFTRAST_State_Triangle.attribs and the in_array4f/post_array4f pointers.
61 typedef enum DPSOFTRAST_ARRAY_e
63 DPSOFTRAST_ARRAY_POSITION,
64 DPSOFTRAST_ARRAY_COLOR,
65 DPSOFTRAST_ARRAY_TEXCOORD0,
66 DPSOFTRAST_ARRAY_TEXCOORD1,
67 DPSOFTRAST_ARRAY_TEXCOORD2,
68 DPSOFTRAST_ARRAY_TEXCOORD3,
69 DPSOFTRAST_ARRAY_TEXCOORD4,
70 DPSOFTRAST_ARRAY_TEXCOORD5,
71 DPSOFTRAST_ARRAY_TEXCOORD6,
72 DPSOFTRAST_ARRAY_TEXCOORD7,
73 DPSOFTRAST_ARRAY_TOTAL
// Texture object; instances live in the dpsoftrast.texture array and are
// addressed by index (see DPSOFTRAST_Texture_GetByIndex).
77 typedef struct DPSOFTRAST_Texture_s
// filter mode, assigned by DPSOFTRAST_Texture_Filter
84 DPSOFTRAST_TEXTURE_FILTER filter;
// per-mip-level table filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
88 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded up to COMMAND_SIZE by DPSOFTRAST_ALLOCATECOMMAND, and
// command structs are declared through COMMAND_ALIGN.
92 #define COMMAND_SIZE ALIGN_SIZE
93 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command type; command pool entries are accessed through pointers to it
// and identified by their opcode member.
95 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// Opcode 0: written by DPSOFTRAST_AllocateCommand at the current pool position
// when the requested command does not fit before the end of the pool.
101 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares, for one command, the constant
// DPSOFTRAST_OPCODE_<name> (= opcodeval) and a COMMAND_ALIGN'ed struct type
// DPSOFTRAST_Command_<name>; presumably `fields` become the struct's payload
// members (the struct body is not visible in this view).
103 #define DEFCOMMAND(opcodeval, name, fields) \
104 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
105 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
109 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer.
111 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// Command pool: the byte buffer DPSOFTRAST_AllocateCommand carves commands from,
// tracked by the freecommand/usedcommands counters used throughout this file.
113 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
117 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
119 DPSOFTRAST_State_Command_Pool);
// Per-triangle state stored in the triangle pool.
121 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
124 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
129 ALIGN(float coords[4][4]);
130 ALIGN(int ycoords[4]);
// attribs[array][0..2] are combined by DPSOFTRAST_CALCATTRIB as
// value = [2] + span->x * [0] + span->y * [1], four components per array
131 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
133 DPSOFTRAST_State_Triangle);
// SSE attribute setup for one span: loads attribs[arrayindex][0] of `triangle`
// into `slope` and sets
//   data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// across all four float components. The attribs rows are read with _mm_load_ps.
135 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
136 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
137 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
138 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
139 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB operating on float[4] arrays:
// copies attribs[arrayindex][0] of `triangle` into `slope` and sets, per component,
//   data[i] = attribs[arrayindex][2][i] + span->x * slope[i] + span->y * attribs[arrayindex][1][i]
// `triangle` and `span` are pointers; every macro argument is parenthesized so
// that compound expressions (e.g. spans + n) expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
152 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle; the x/y members are the
// values DPSOFTRAST_CALCATTRIB evaluates attributes at.
154 typedef ALIGN(struct DPSOFTRAST_State_Span_s
156 int triangle; // triangle this span was generated by
157 int x; // framebuffer x coord
158 int y; // framebuffer y coord
159 int length; // pixel count
160 int startx; // usable range (according to pixelmask)
161 int endx; // usable range (according to pixelmask)
162 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
164 DPSOFTRAST_State_Span);
// Capacity of the per-thread spans array in DPSOFTRAST_State_Thread.
166 #define DPSOFTRAST_DRAW_MAXSPANS 1024
// Number of entries in the triangle pool; freetriangle wraps back to 0 after the
// last entry (see DPSOFTRAST_Draw_SyncCommands).
168 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
169 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
// Triangle pool shared between the submitting code and the worker threads.
171 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
175 ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
177 DPSOFTRAST_State_Triangle_Pool);
// Dirty bits kept in thread->validate: command interpreters set a bit when they
// change state that a derived value depends on, and DPSOFTRAST_Validate clears
// the bit after calling the matching Recalc function.
179 #define DPSOFTRAST_VALIDATE_FB 1
180 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
181 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all of the bits above
182 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes stored in thread->fb_blendmode; DPSOFTRAST_RecalcBlendFunc picks one
// from the blendfunc factor pair and the blendsubtract flag, defaulting to OPAQUE
// for unrecognized pairs.
184 typedef enum DPSOFTRAST_BLENDMODE_e
186 DPSOFTRAST_BLENDMODE_OPAQUE,
187 DPSOFTRAST_BLENDMODE_ALPHA,
188 DPSOFTRAST_BLENDMODE_ADDALPHA,
189 DPSOFTRAST_BLENDMODE_ADD,
190 DPSOFTRAST_BLENDMODE_INVMOD,
191 DPSOFTRAST_BLENDMODE_MUL,
192 DPSOFTRAST_BLENDMODE_MUL2,
193 DPSOFTRAST_BLENDMODE_SUBALPHA,
194 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
// number of blend modes
195 DPSOFTRAST_BLENDMODE_TOTAL
197 DPSOFTRAST_BLENDMODE;
// Per-worker-thread copy of the render state; the DPSOFTRAST_Interpret_* functions
// update it as each thread replays the command pool.
199 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 float polygonoffset[2];
222 int shader_permutation;
224 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
226 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
227 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
229 // DPSOFTRAST_VALIDATE_ flags
232 // derived values (DPSOFTRAST_VALIDATE_FB)
// x, y, width, height of the clear rectangle computed by DPSOFTRAST_RecalcFB
234 int fb_clearscissor[4];
236 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
239 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; DPSOFTRAST_Draw_FreeCommandPool
// reads it to work out how much of the pool is still in use
242 ATOMIC(int commandoffset);
251 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
253 DPSOFTRAST_State_Thread);
// Global rasterizer state owned by the submitting side: render targets, vertex
// array pointers, the texture table, worker threads and the command/triangle pools.
255 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets
259 unsigned int *fb_depthpixels;
260 unsigned int *fb_colorpixels[4];
// viewport transform written by DPSOFTRAST_Viewport
263 ALIGN(float fb_viewportcenter[4]);
264 ALIGN(float fb_viewportscale[4]);
267 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
268 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
272 const float *pointer_vertex3f;
273 const float *pointer_color4f;
274 const unsigned char *pointer_color4ub;
275 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
278 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
279 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
280 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
284 float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
285 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
286 float *screencoord4f;
289 int shader_permutation;
// texture table bookkeeping: texture_firstfree is where DPSOFTRAST_Texture_New
// starts searching for an empty slot
293 int texture_firstfree;
294 DPSOFTRAST_Texture *texture;
// last error message; set to a static string by the API functions on failure
299 const char *errorstring;
302 DPSOFTRAST_State_Thread *threads;
// mutex/condition pair used while waiting for worker threads to release pool space
304 SDL_mutex *trianglemutex;
305 SDL_cond *trianglecond;
// published by DPSOFTRAST_Draw_SyncCommands after a MEMORY_BARRIER
308 ATOMIC(int drawtriangle);
310 DPSOFTRAST_State_Command_Pool commandpool;
311 DPSOFTRAST_State_Triangle_Pool trianglepool;
// The single global rasterizer state instance used by every function in this file.
315 DPSOFTRAST_State dpsoftrast;
// Defined elsewhere; its use is not visible in this part of the file.
317 extern int dpsoftrast_test;
// Depth buffer encoding constants: depth values are scaled by 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four floats (nominally 0..1) into one pixel laid out as
// alpha<<24 | red<<16 | green<<8 | blue, rounding each channel to nearest.
// Arguments are parenthesized so compound expressions scale as a whole;
// inputs are not clamped.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d into the integer depth buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d), truncated toward zero.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Recomputes thread->fb_clearscissor (x, y, width, height) from the thread's
// scissor rectangle, the scissortest flag and the framebuffer dimensions.
325 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
327 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
328 // and viewport projection values
// scissor y is flipped against fb_height (scissor[1] is measured from the
// opposite edge of the framebuffer)
331 x1 = thread->scissor[0];
332 x2 = thread->scissor[0] + thread->scissor[2];
333 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
334 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissoring disabled the rectangle is the whole framebuffer
335 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer
337 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
339 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as origin + size, not as two corners
340 thread->fb_clearscissor[0] = x1;
341 thread->fb_clearscissor[1] = y1;
342 thread->fb_clearscissor[2] = x2 - x1;
343 thread->fb_clearscissor[3] = y2 - y1;
// Derives thread->fb_depthfunc: the configured depthfunc while depth testing is
// enabled, GL_ALWAYS otherwise.
346 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
348 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Derives thread->fb_blendmode from the (source, destination) blend factor pair
// in thread->blendfunc and the blendsubtract flag. The two factors are packed
// into one switch key as (sfactor << 16) | dfactor; unrecognized pairs fall back
// to DPSOFTRAST_BLENDMODE_OPAQUE.
351 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only one factor pair is recognized
353 if (thread->blendsubtract)
355 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the blend mode and breaks
357 #define BLENDFUNC(sfactor, dfactor, blendmode) \
358 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
359 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
360 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
365 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
367 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
368 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
369 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
370 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
371 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
372 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
373 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
374 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
375 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
// NOTE(review): this maps (GL_SRC_COLOR, GL_ONE) to SUBALPHA even though
// blendsubtract is off here - confirm this is intended.
376 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
377 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the bits in f is
// pending in thread->validate; always evaluates to 0. Both arguments are
// parenthesized so pointer expressions such as `threads + i` expand correctly.
// Note that `thread` and `f` are each evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Brings the derived state selected by `mask` up to date for `thread`.
// Only bits that are both requested and pending in thread->validate are handled;
// each one is cleared from thread->validate before its Recalc function runs.
384 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// drop requested bits that are not actually dirty
386 mask &= thread->validate;
389 if (mask & DPSOFTRAST_VALIDATE_FB)
391 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
392 DPSOFTRAST_RecalcFB(thread);
394 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
396 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
397 DPSOFTRAST_RecalcDepthFunc(thread);
399 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
401 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
402 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index. A pointer into dpsoftrast.texture is returned when
// index is in [1, texture_end) and the slot has pixel storage (bytes != NULL).
// The failure return is not visible in this view; callers test the result with
// `!texture`, so it is presumably NULL.
406 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
408 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
409 return &dpsoftrast.texture[index];
// Creates a texture of the given dimensions and flags in the first free slot of
// dpsoftrast.texture, growing the table when needed, builds the mip level table
// and allocates zero-filled pixel storage for all levels.
// Invalid argument combinations set dpsoftrast.errorstring; the return statements
// are not visible in this view (presumably the new texture index on success and
// 0 on failure - confirm).
413 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces, everything else one
422 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
423 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
424 DPSOFTRAST_Texture *texture;
// --- argument validation ---
425 if (width*height*depth < 1)
427 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
430 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
432 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
437 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
438 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
439 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
441 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
442 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
444 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
449 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
452 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
454 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
459 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
461 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
464 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
466 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the check directly above verbatim.
469 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
471 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
474 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
476 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x is not a power of two (for x >= 1)
479 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
481 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
// --- slot allocation ---
485 // find first empty slot in texture array
486 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
487 if (!dpsoftrast.texture[texnum].bytes)
489 dpsoftrast.texture_firstfree = texnum + 1;
490 if (dpsoftrast.texture_max <= texnum)
492 // expand texture array as needed
493 if (dpsoftrast.texture_max < 1024)
494 dpsoftrast.texture_max = 1024;
496 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the table and
// no NULL check is visible here; on failure the old table would be lost.
497 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
499 if (dpsoftrast.texture_end <= texnum)
500 dpsoftrast.texture_end = texnum + 1;
501 texture = &dpsoftrast.texture[texnum];
502 memset(texture, 0, sizeof(*texture));
503 texture->flags = flags;
504 texture->width = width;
505 texture->height = height;
506 texture->depth = depth;
507 texture->sides = sides;
// --- mip chain layout: one table row per level, 4 bytes per texel ---
518 s = w * h * d * sides * 4;
519 texture->mipmap[mipmaps][0] = size;
520 texture->mipmap[mipmaps][1] = s;
521 texture->mipmap[mipmaps][2] = w;
522 texture->mipmap[mipmaps][3] = h;
523 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after level 0 for non-mipmapped textures
526 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
532 texture->mipmaps = mipmaps;
533 texture->size = size;
535 // allocate the pixels now
536 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index`, clears its slot and updates the
// free/used bookkeeping of the texture table. Does nothing for an invalid index.
540 void DPSOFTRAST_Texture_Free(int index)
542 DPSOFTRAST_Texture *texture;
543 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
546 MM_FREE(texture->bytes);
547 texture->bytes = NULL;
// zeroing the slot marks it empty (bytes == NULL is the "free slot" test)
548 memset(texture, 0, sizeof(*texture));
549 // adjust the free range and used range
550 if (dpsoftrast.texture_firstfree > index)
551 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing empty slots
552 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
553 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index` by averaging texels of
// the next larger level (4 bytes per texel, each byte channel averaged with
// rounding). Does nothing for an invalid index or a texture with a single level.
555 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
557 int i, x, y, z, w, layer0, layer1, row0, row1;
// o: output row in level i; i0..i3: the source rows in level i-1
// (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
558 unsigned char *o, *i0, *i1, *i2, *i3;
559 DPSOFTRAST_Texture *texture;
560 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
561 if (texture->mipmaps <= 1)
563 for (i = 1;i < texture->mipmaps;i++)
565 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
569 if (layer1 >= texture->mipmap[i-1][4])
570 layer1 = texture->mipmap[i-1][4]-1;
571 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
575 if (row1 >= texture->mipmap[i-1][3])
576 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 * ((height * layer + row) * width)
577 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
578 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
579 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
580 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
581 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
582 w = texture->mipmap[i][2];
585 if (texture->mipmap[i-1][2] > 1)
587 // average 3D texture
// eight source texels per output texel: two columns from each of four rows
588 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
590 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
591 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
592 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
593 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
598 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop header; i2 and i3 do not.
599 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
601 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
602 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
603 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
604 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
610 if (texture->mipmap[i-1][2] > 1)
612 // average 2D texture (common case)
// four source texels per output texel: two columns from each of two rows
613 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
615 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
616 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
617 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
618 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
623 // 2D texture with parent width == 1
624 o[0] = (i0[0] + i1[0] + 1) >> 1;
625 o[1] = (i0[1] + i1[1] + 1) >> 1;
626 o[2] = (i0[2] + i1[2] + 1) >> 1;
627 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into texture `index` at (blockx, blocky), then rebuilds
// the mip chain. Does nothing for an invalid index.
// NOTE(review): in the lines visible here the destination is always computed from
// mip level 0 (mipmap[0][2]); the `mip` parameter is not used - confirm.
634 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
636 DPSOFTRAST_Texture *texture;
638 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
640 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
641 while (blockheight > 0)
643 memcpy(dst, pixels, blockwidth * 4);
// source rows are packed; destination rows are a full level-0 width apart
644 pixels += blockwidth * 4;
645 dst += texture->mipmap[0][2] * 4;
648 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 of texture `index` with `pixels` (mipmap[0][1]
// bytes are copied) and rebuilds the mip chain. Does nothing for an invalid index.
650 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
652 DPSOFTRAST_Texture *texture;
653 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
655 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
656 DPSOFTRAST_Texture_CalculateMipmaps(index);
658 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
660 DPSOFTRAST_Texture *texture;
661 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
662 return texture->mipmap[mip][2];
664 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
666 DPSOFTRAST_Texture *texture;
667 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
668 return texture->mipmap[mip][3];
670 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
672 DPSOFTRAST_Texture *texture;
673 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
674 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the pixel storage
// of texture `index`, or 0 for an invalid index.
// NOTE(review): `mip` is used to index the mipmap table without a range check in
// the lines visible here.
676 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
678 DPSOFTRAST_Texture *texture;
679 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset into bytes
681 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created without
// DPSOFTRAST_TEXTURE_FLAG_MIPMAP set dpsoftrast.errorstring instead (the exit from
// that branch is not visible in this view). Does nothing for an invalid index.
683 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
685 DPSOFTRAST_Texture *texture;
686 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
687 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
689 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
693 texture->filter = filter;
// Installs the framebuffer dimensions, the depth buffer and up to four color
// buffers as the current render targets. The global state is rewritten only when
// at least one of the values differs from what is already installed.
696 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
698 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
699 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
700 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
702 dpsoftrast.fb_width = width;
703 dpsoftrast.fb_height = height;
704 dpsoftrast.fb_depthpixels = depthpixels;
705 dpsoftrast.fb_colorpixels[0] = colorpixels0;
706 dpsoftrast.fb_colorpixels[1] = colorpixels1;
707 dpsoftrast.fb_colorpixels[2] = colorpixels2;
708 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// forward declaration; the definition is outside this part of the file
711 void DPSOFTRAST_Draw_FlushThreads(void);
// Ensures at least `space` entries of the triangle pool are unused, waiting on
// worker threads while holding trianglemutex when they are not.
// usedtriangles is recomputed as the largest distance (with wraparound) between
// the producer position freetriangle and any thread's triangleoffset, and is
// written back to the pool at the end.
713 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
715 DPSOFTRAST_State_Thread *thread;
717 int freetriangle = dpsoftrast.trianglepool.freetriangle;
718 int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
// fast path test: enough room according to the cached count
719 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
722 SDL_LockMutex(dpsoftrast.trianglemutex);
729 for (i = 0; i < dpsoftrast.numthreads; i++)
731 thread = &dpsoftrast.threads[i];
// entries between this thread's position and the producer, modulo pool size
732 triangleoffset = freetriangle - thread->triangleoffset;
733 if (triangleoffset < 0)
734 triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
735 if (triangleoffset > usedtriangles)
738 usedtriangles = triangleoffset;
741 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
// flag the chosen thread as awaited, wake the workers, then sleep on that
// thread's own condition variable
744 thread = &dpsoftrast.threads[waitindex];
745 thread->waiting = true;
746 SDL_CondBroadcast(dpsoftrast.trianglecond);
747 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
748 thread->waiting = false;
752 SDL_UnlockMutex(dpsoftrast.trianglemutex);
754 dpsoftrast.trianglepool.usedtriangles = usedtriangles;
// Appends a marker entry to the triangle pool that records the current command
// pool position (commandoffset = freecommand, starty = -1) and then publishes the
// new pool position through dpsoftrast.drawtriangle.
757 void DPSOFTRAST_Draw_SyncCommands(void)
759 DPSOFTRAST_State_Triangle *triangle;
// make room first if the pool is (almost) full
760 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
762 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
764 DPSOFTRAST_Draw_FlushThreads();
766 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
767 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
768 triangle->starty = -1;
// advance the producer position, wrapping to 0 after the last entry
770 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
771 dpsoftrast.trianglepool.usedtriangles++;
773 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
// Ensures at least `space` bytes of the command pool are unused, waiting on
// worker threads while holding trianglemutex when they are not. Mirrors
// DPSOFTRAST_Draw_FreeTrianglePool, but measures distances in command bytes
// using each thread's commandoffset.
776 void DPSOFTRAST_Draw_FreeCommandPool(int space)
778 DPSOFTRAST_State_Thread *thread;
780 int freecommand = dpsoftrast.commandpool.freecommand;
781 int usedcommands = dpsoftrast.commandpool.usedcommands;
// fast path test: enough room according to the cached count
782 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
784 DPSOFTRAST_Draw_SyncCommands();
786 SDL_LockMutex(dpsoftrast.trianglemutex);
793 for (i = 0; i < dpsoftrast.numthreads; i++)
795 thread = &dpsoftrast.threads[i];
// bytes between this thread's position and the producer, modulo pool size
796 commandoffset = freecommand - thread->commandoffset;
797 if (commandoffset < 0)
798 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
799 if (commandoffset > usedcommands)
802 usedcommands = commandoffset;
805 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// flag the chosen thread as awaited, wake the workers, then sleep on that
// thread's own condition variable
808 thread = &dpsoftrast.threads[waitindex];
809 thread->waiting = true;
810 SDL_CondBroadcast(dpsoftrast.trianglecond);
811 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
812 thread->waiting = false;
816 SDL_UnlockMutex(dpsoftrast.trianglemutex);
818 dpsoftrast.commandpool.usedcommands = usedcommands;
// Allocates a DPSOFTRAST_Command_<name> from the command pool. The requested size
// is sizeof the command struct rounded up to the next multiple of COMMAND_SIZE
// (the masking assumes COMMAND_SIZE is a power of two).
821 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
822 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))))
// Reserves `size` bytes in the command pool and (presumably - the return is not
// visible in this view) returns a pointer to them.
// When the command does not fit in the tail of the pool, a Reset opcode is
// written at the current position and the unused tail is counted as used.
824 static void *DPSOFTRAST_AllocateCommand(int size)
826 DPSOFTRAST_Command *command;
827 int freecommand = dpsoftrast.commandpool.freecommand;
828 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header, plus the tail if it must be skipped
829 int extra = sizeof(DPSOFTRAST_Command);
830 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free bytes: wait for the worker threads to release some
832 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
835 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
837 DPSOFTRAST_Draw_FlushThreads();
// re-read the counters, which the calls above may have changed
839 freecommand = dpsoftrast.commandpool.freecommand;
840 usedcommands = dpsoftrast.commandpool.usedcommands;
842 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
// mark the tail so readers know to skip it
844 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845 command->opcode = DPSOFTRAST_OPCODE_Reset;
846 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
849 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
851 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
854 dpsoftrast.commandpool.freecommand = freecommand;
855 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Viewport command (opcode 1): x, y, width, height.
859 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker-side handler: copies the rectangle into thread->viewport and marks the
// framebuffer-derived state dirty.
860 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 thread->viewport[0] = command->x;
863 thread->viewport[1] = command->y;
864 thread->viewport[2] = command->width;
865 thread->viewport[3] = command->height;
866 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command for the worker threads and updates the global
// viewport rectangle plus the viewport transform (fb_viewportcenter/scale).
// In both transform arrays element 1 is the x term, 2 the y term (flipped
// against fb_height), 3 is 0.5 and 0 is the constant 0 / 1.
868 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
871 command->opcode = DPSOFTRAST_OPCODE_Viewport;
874 command->width = width;
875 command->height = height;
877 dpsoftrast.viewport[0] = x;
878 dpsoftrast.viewport[1] = y;
879 dpsoftrast.viewport[2] = width;
880 dpsoftrast.viewport[3] = height;
// center of the viewport in pixel coordinates, shifted by half a pixel
881 dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
882 dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
883 dpsoftrast.fb_viewportcenter[3] = 0.5f;
884 dpsoftrast.fb_viewportcenter[0] = 0.0f;
// half extents; the y scale is negative
885 dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
886 dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
887 dpsoftrast.fb_viewportscale[3] = 0.5f;
888 dpsoftrast.fb_viewportscale[0] = 1.0f;
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
894 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
897 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
898 x1 = thread->fb_clearscissor[0];
899 y1 = thread->fb_clearscissor[1];
900 x2 = thread->fb_clearscissor[2];
901 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
902 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
903 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
910 // FIXME: honor fb_colormask?
911 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
912 for (i = 0;i < 4;i++)
914 if (!dpsoftrast.fb_colorpixels[i])
916 for (y = y1;y < y2;y++)
918 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
919 for (x = x1;x < x2;x++)
// Queues a ClearColor command for the worker threads.
// (The lines storing r, g, b, a into the command are not visible in this view.)
924 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
926 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
927 command->opcode = DPSOFTRAST_OPCODE_ClearColor;
934 DEFCOMMAND(3, ClearDepth, float depth;)
935 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
937 int x1, y1, x2, y2, w, h, x, y, t1, t2;
940 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
941 x1 = thread->fb_clearscissor[0];
942 y1 = thread->fb_clearscissor[1];
943 x2 = thread->fb_clearscissor[2];
944 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
945 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
946 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
953 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
954 for (y = y1;y < y2;y++)
956 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957 for (x = x1;x < x2;x++)
// Queues a ClearDepth command for the worker threads.
// (The line storing d into the command is not visible in this view.)
961 void DPSOFTRAST_ClearDepth(float d)
963 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964 command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
// ColorMask command (opcode 4): one write-enable flag per channel.
968 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker-side handler: normalizes each flag to 0/1 and builds fb_colormask, a
// per-pixel bit mask with 0xFF in the byte of every enabled channel
// (red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000).
969 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
971 thread->colormask[0] = command->r != 0;
972 thread->colormask[1] = command->g != 0;
973 thread->colormask[2] = command->b != 0;
974 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives all-zero or all-one bits, which then select the byte
975 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command for the worker threads.
// (The lines storing r, g, b, a into the command are not visible in this view.)
977 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
979 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
980 command->opcode = DPSOFTRAST_OPCODE_ColorMask;
// DepthTest command (opcode 5): enable flag.
987 DEFCOMMAND(5, DepthTest, int enable;)
// Worker-side handler: stores the flag and marks the derived depth function
// dirty, since fb_depthfunc depends on depthtest.
988 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
990 thread->depthtest = command->enable;
991 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying `enable` for the worker threads.
993 void DPSOFTRAST_DepthTest(int enable)
995 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
996 command->opcode = DPSOFTRAST_OPCODE_DepthTest;
997 command->enable = enable;
// ScissorTest command (opcode 6): enable flag.
1000 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker-side handler: stores the flag and marks the framebuffer-derived state
// dirty, since the clear rectangle depends on scissortest.
1001 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1003 thread->scissortest = command->enable;
1004 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying `enable` for the worker threads.
1006 void DPSOFTRAST_ScissorTest(int enable)
1008 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1009 command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1010 command->enable = enable;
// Scissor command (opcode 7): x, y, width, height as floats.
1013 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Worker-side handler: copies the rectangle into thread->scissor and marks the
// framebuffer-derived state dirty.
1014 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1016 thread->scissor[0] = command->x;
1017 thread->scissor[1] = command->y;
1018 thread->scissor[2] = command->width;
1019 thread->scissor[3] = command->height;
1020 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command for the worker threads.
// (The lines storing x and y into the command are not visible in this view.)
1022 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1024 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1025 command->opcode = DPSOFTRAST_OPCODE_Scissor;
1028 command->width = width;
1029 command->height = height;
// BlendFunc command (opcode 8): source and destination blend factors.
1032 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Worker-side handler: stores the factor pair and marks the derived blend mode dirty.
1033 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1035 thread->blendfunc[0] = command->sfactor;
1036 thread->blendfunc[1] = command->dfactor;
1037 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command carrying the factor pair for the worker threads.
1039 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1041 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1042 command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1043 command->sfactor = sfactor;
1044 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): enable flag.
1047 DEFCOMMAND(9, BlendSubtract, int enable;)
// Worker-side handler: stores the flag and marks the derived blend mode dirty,
// since DPSOFTRAST_RecalcBlendFunc branches on blendsubtract.
1048 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1050 thread->blendsubtract = command->enable;
1051 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying `enable` for the worker threads.
1053 void DPSOFTRAST_BlendSubtract(int enable)
1055 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1056 command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1057 command->enable = enable;
// DepthMask command (opcode 10): enable flag.
1060 DEFCOMMAND(10, DepthMask, int enable;)
// Worker-side handler: stores the flag in thread->depthmask.
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1063 thread->depthmask = command->enable;
// Queues a DepthMask command carrying `enable` for the worker threads.
1065 void DPSOFTRAST_DepthMask(int enable)
1067 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068 command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1069 command->enable = enable;
1072 DEFCOMMAND(11, DepthFunc, int func;)
1073 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1075 thread->depthfunc = command->func;
// Queues a DepthFunc command carrying `func` for the worker threads.
1077 void DPSOFTRAST_DepthFunc(int func)
1079 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1080 command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1081 command->func = func;
// DepthRange command (opcode 12): near and far values.
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Worker-side handler: stores near/far in thread->depthrange[0]/[1].
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1087 thread->depthrange[0] = command->nearval;
1088 thread->depthrange[1] = command->farval;
// Queues a DepthRange command carrying near/far for the worker threads.
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1092 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093 command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1094 command->nearval = nearval;
1095 command->farval = farval;
// PolygonOffset command (opcode 13): the two offset terms.
1098 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Worker-side handler: stores alongnormal/intoview in thread->polygonoffset[0]/[1].
1099 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1101 thread->polygonoffset[0] = command->alongnormal;
1102 thread->polygonoffset[1] = command->intoview;
1104 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1106 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1107 command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1108 command->alongnormal = alongnormal;
1109 command->intoview = intoview;
1112 void DPSOFTRAST_CullFace(int mode)
1114 dpsoftrast.cullface = mode;
1117 DEFCOMMAND(15, AlphaTest, int enable;)
1118 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1120 thread->alphatest = command->enable;
1122 void DPSOFTRAST_AlphaTest(int enable)
1124 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1125 command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1126 command->enable = enable;
1129 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1130 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1132 thread->alphafunc = command->func;
1133 thread->alphavalue = command->ref;
1135 void DPSOFTRAST_AlphaFunc(int func, float ref)
1137 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1138 command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1139 command->func = func;
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1145 dpsoftrast.color[0] = r;
1146 dpsoftrast.color[1] = g;
1147 dpsoftrast.color[2] = b;
1148 dpsoftrast.color[3] = a;
// Reads a rectangular block of the color framebuffer (fb_colorpixels[0]) into outpixels.
// blockx/blocky/blockwidth/blockheight describe the requested rectangle; outpixels receives
// rows of blockwidth*4 bytes each.
// NOTE(review): the embedded line numbering skips in several places in this function
// (declarations of bx1/by1/x/y/o/b and the per-pixel copy bodies are not visible here).
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Both strides are in bytes, 4 bytes per pixel.
1153 int outstride = blockwidth * 4;
1154 int instride = dpsoftrast.fb_width * 4;
1157 int bx2 = blockx + blockwidth;
1158 int by2 = blocky + blockheight;
1163 unsigned char *inpixels;
// Clip the requested rectangle to the framebuffer bounds.
1167 if (bx1 < 0) bx1 = 0;
1168 if (by1 < 0) by1 = 0;
1169 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// A separate row loop is used when dpsoftrast.bigendian is set.
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// Source row index is (fb_height - 1 - y): rows are read in the opposite vertical order
// from the one in which they are written to outpixels.
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// Output rows are indexed relative to the clipped top edge by1.
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
// Second row loop with the same source/destination addressing (presumably the
// non-bigendian path — confirm against the full source).
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle of the color framebuffer, starting at (sx, sy), into
// mip level `mip` of the texture identified by `index`, starting at texel (tx, ty).
// Returns without copying when the texture lookup fails or mip is out of range.
// NOTE(review): declarations of tx1/ty1/sx1/sy1/tw/th/sw/sh and related lines are not
// visible in this listing (the embedded numbering skips).
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// Source is the framebuffer; destination is the selected mip level inside texture->bytes.
1227 spixels = dpsoftrast.fb_colorpixels[0];
1228 swidth = dpsoftrast.fb_width;
1229 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] as width, [3] as height.
1230 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231 twidth = texture->mipmap[mip][2];
1232 theight = texture->mipmap[mip][3];
// Clip the destination rectangle to the mip level and the source rectangle to the framebuffer.
1233 if (tx1 < 0) tx1 = 0;
1234 if (ty1 < 0) ty1 = 0;
1235 if (tx2 > twidth) tx2 = twidth;
1236 if (ty2 > theight) ty2 = theight;
1237 if (sx1 < 0) sx1 = 0;
1238 if (sy1 < 0) sy1 = 0;
1239 if (sx2 > swidth) sx2 = swidth;
1240 if (sy2 > sheight) sy2 = sheight;
// The copied size is limited to the smaller of the two clipped rectangles.
1245 if (tw > sw) tw = sw;
1246 if (th > sh) th = sh;
// Empty result after clipping (the statement guarded by this test is not visible here;
// presumably an early return — confirm).
1247 if (tw < 1 || th < 1)
// Row-by-row copy; pointers are unsigned int, so offsets are in pixels and tw*4 is the byte count.
1249 for (y = 0;y < th;y++)
1250 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// Refresh the remaining mip levels when the texture has more than one.
1251 if (texture->mipmaps > 1)
1252 DPSOFTRAST_Texture_CalculateMipmaps(index);
1255 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1256 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 thread->texbound[command->unitnum] = command->texture;
1260 void DPSOFTRAST_SetTexture(int unitnum, int index)
1262 DPSOFTRAST_Command_SetTexture *command;
1263 DPSOFTRAST_Texture *texture;
1264 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1266 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1269 texture = DPSOFTRAST_Texture_GetByIndex(index);
1270 if (index && !texture)
1272 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1277 command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1278 command->unitnum = unitnum;
1279 command->texture = texture;
1281 dpsoftrast.texbound[unitnum] = texture;
1284 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1286 dpsoftrast.pointer_vertex3f = vertex3f;
1287 dpsoftrast.stride_vertex = stride;
1289 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1291 dpsoftrast.pointer_color4f = color4f;
1292 dpsoftrast.pointer_color4ub = NULL;
1293 dpsoftrast.stride_color = stride;
1295 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1297 dpsoftrast.pointer_color4f = NULL;
1298 dpsoftrast.pointer_color4ub = color4ub;
1299 dpsoftrast.stride_color = stride;
1301 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1303 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1304 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1305 dpsoftrast.stride_texcoord[unitnum] = stride;
1308 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1309 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1311 thread->shader_mode = command->mode;
1312 thread->shader_permutation = command->permutation;
1314 void DPSOFTRAST_SetShader(int mode, int permutation)
1316 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1317 command->opcode = DPSOFTRAST_OPCODE_SetShader;
1318 command->mode = mode;
1319 command->permutation = permutation;
1321 dpsoftrast.shader_mode = mode;
1322 dpsoftrast.shader_permutation = permutation;
1325 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1326 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1328 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1330 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1332 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1333 command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1334 command->index = index;
1335 command->val[0] = v0;
1336 command->val[1] = v1;
1337 command->val[2] = v2;
1338 command->val[3] = v3;
1340 dpsoftrast.uniform4f[index*4+0] = v0;
1341 dpsoftrast.uniform4f[index*4+1] = v1;
1342 dpsoftrast.uniform4f[index*4+2] = v2;
1343 dpsoftrast.uniform4f[index*4+3] = v3;
1345 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348 command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command record carrying one 4x4 float matrix (16 floats, 16-byte aligned via ALIGN) and
// the uniform index it starts at.
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's uniform4f
// table starting at slot index*4 (so one matrix spans four consecutive 4-float uniforms).
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: uploads `arraysize` consecutive 4x4 matrices read from v.
// NOTE(review): the declarations of i/index, the else between the two load groups, and the
// condition guarding the transpose group are not visible in this listing; the `transpose`
// parameter presumably selects the unpack/move group below — confirm.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// Each matrix advances the uniform index by 4 and the source pointer by 16 floats.
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1369 command->index = index;
// Source not aligned to ALIGN_SIZE: use unaligned loads.
1370 if (((size_t)v)&(ALIGN_SIZE-1))
1372 m0 = _mm_loadu_ps(v);
1373 m1 = _mm_loadu_ps(v+4);
1374 m2 = _mm_loadu_ps(v+8);
1375 m3 = _mm_loadu_ps(v+12);
// Aligned loads of the same four rows.
1379 m0 = _mm_load_ps(v);
1380 m1 = _mm_load_ps(v+4);
1381 m2 = _mm_load_ps(v+8);
1382 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack lo/hi followed by movelh/movehl.
1386 __m128 t0, t1, t2, t3;
1387 t0 = _mm_unpacklo_ps(m0, m1);
1388 t1 = _mm_unpacklo_ps(m2, m3);
1389 t2 = _mm_unpackhi_ps(m0, m1);
1390 t3 = _mm_unpackhi_ps(m2, m3);
1391 m0 = _mm_movelh_ps(t0, t1);
1392 m1 = _mm_movehl_ps(t1, t0);
1393 m2 = _mm_movelh_ps(t2, t3);
1394 m3 = _mm_movehl_ps(t3, t2);
// Store the rows into the command (aligned stores; val is declared with ALIGN).
1396 _mm_store_ps(command->val, m0);
1397 _mm_store_ps(command->val+4, m1);
1398 _mm_store_ps(command->val+8, m2);
1399 _mm_store_ps(command->val+12, m3);
// Mirror the same rows in the global uniform4f table; aligned stores are used, so the
// table entry at index*4 must be 16-byte aligned.
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1403 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1408 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1409 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1411 thread->uniform1i[command->index] = command->val;
1413 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1415 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1416 command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1417 command->index = index;
1420 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float vectors from a strided byte source into dst using aligned 16-byte stores.
// dst: destination floats (written with _mm_store_ps, so it must be 16-byte aligned).
// src/stride: source bytes and byte distance between consecutive vectors.
// size: number of vectors; `end` is dst + size*4.
// NOTE(review): the loop headers and pointer increments are not visible in this listing.
1424 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1426 float *end = dst + size*4;
// If either the source address or the stride is not a multiple of ALIGN_SIZE, use unaligned loads.
1427 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1431 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// Aligned-load variant of the same copy.
1440 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float vectors from a strided byte source into 4-float vectors (x, y, z, 1.0)
// in dst, using aligned 16-byte stores.
// size: number of vectors; `end` is dst + size*4.
// NOTE(review): loop headers, else branches and dst increments are not visible in this listing.
1447 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1449 float *end = dst + size*4;
// Tightly packed source: convert four vectors (12 floats = three 16-byte loads) per step.
1450 if (stride == sizeof(float[3]))
// end4 is the end of the last whole group of four vectors.
1452 float *end4 = dst + (size&~3)*4;
1453 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1457 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// Each output is built by placing its x,y,z in lanes 1..3, overwriting lane 0 with 1.0
// via _mm_move_ss, then rotating so the 1.0 ends up in lane 3.
// vector 0: floats 0..2 of v1
1458 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1459 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1460 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 1: float 3 of v1 and floats 0..1 of v2
1461 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1462 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 2: floats 2..3 of v2 and float 0 of v3
1464 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1465 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1466 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 3: floats 1..3 of v3 are already in lanes 1..3
1468 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1469 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471 src += 4*sizeof(float[3]);
// Same four-at-a-time conversion using aligned loads.
1478 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1479 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1480 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1481 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1483 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1486 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1487 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1490 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492 src += 4*sizeof(float[3]);
// One vector at a time: a full 16-byte load is issued per vector and the fourth lane is
// replaced by 1.0. NOTE(review): this reads 4 bytes past each 3-float vector — confirm the
// source buffers always have that slack.
1496 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1500 __m128 v = _mm_loadu_ps((const float *)src);
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1502 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1503 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1504 _mm_store_ps(dst, v);
// Aligned-load variant of the single-vector conversion.
1513 __m128 v = _mm_load_ps((const float *)src);
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1515 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1516 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1517 _mm_store_ps(dst, v);
// Expands 2-float vectors from a strided byte source into 4-float vectors (x, y, 0.0, 1.0)
// in dst, using aligned 16-byte stores.
// size: number of vectors; `end` is dst + size*4.
// NOTE(review): loop headers, else branches and dst increments are not visible in this listing.
1524 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1526 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector.
1527 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed source: one 16-byte load yields two vectors.
1528 if (stride == sizeof(float[2]))
// end2 is the end of the last whole pair of vectors.
1530 float *end2 = dst + (size&~1)*4;
1531 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1535 __m128 v = _mm_loadu_ps((const float *)src);
// first vector: low two lanes of v + high two lanes of v2
1536 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second vector: high two lanes of v moved low + high two lanes of v2
1537 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1539 src += 2*sizeof(float[2]);
// Aligned-load variant of the pairwise conversion.
1546 __m128 v = _mm_load_ps((const float *)src);
1547 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1548 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1550 src += 2*sizeof(float[2]);
// Single vector: load only 8 bytes into the low half, keeping (0, 1) from v2 in the high half.
1556 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte vectors from a strided byte source into 4-float vectors in dst, scaling
// each byte by 1/255 and keeping the component order. Uses aligned 16-byte stores.
// size: number of vectors; `end` is dst + size*4.
// NOTE(review): loop headers, else branches and dst increments are not visible in this listing.
1562 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1564 float *end = dst + size*4;
1565 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed source: one 16-byte load yields four vectors.
1566 if (stride == sizeof(unsigned char[4]))
// end4 is the end of the last whole group of four vectors.
1568 float *end4 = dst + (size&~3)*4;
1569 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Zero-extend bytes to 16-bit (v1 = low 8 bytes, v2 = high 8 bytes), then to 32-bit,
// convert to float and scale.
1573 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1574 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1576 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1577 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1579 src += 4*sizeof(unsigned char[4]);
// Aligned-load variant of the four-at-a-time conversion.
1586 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1587 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1589 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1590 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1592 src += 4*sizeof(unsigned char[4]);
// Single vector: read the four bytes as one int, then widen and scale as above.
1598 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1599 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the single 4-float vector at src into dst with aligned 16-byte stores.
// size: number of vectors; `end` is dst + 4*size (presumably the loop bound — the loop
// header and dst increment are not visible in this listing).
1605 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1607 float *end = dst + 4*size;
// src is loaded once, unaligned, before any store.
1608 __m128 v = _mm_loadu_ps(src);
1611 _mm_store_ps(dst, v);
// Loads `numvertices` vertices starting at `firstvertex` from the client arrays recorded
// in the global state into the 4-float working arrays dpsoftrast.in_array4f[].
// Array slot 0 receives positions, slot 1 colors, slots 2.. texcoord sets.
// NOTE(review): local declarations, the `needcolors` test, the final else, and the switch
// case labels are not visible in this listing.
1617 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1626 const unsigned char *b;
1627 dpsoftrast.numvertices = numvertices;
// Grow the working storage when the request exceeds the current capacity.
1628 if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
// Capacity starts at 4096 and doubles until it fits.
1630 if (dpsoftrast.maxvertices < 4096)
1631 dpsoftrast.maxvertices = 4096;
1632 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1633 dpsoftrast.maxvertices *= 2;
// in_array4f[0] is the base of the single allocation that backs every array below.
1634 if (dpsoftrast.in_array4f[0])
1635 MM_FREE(dpsoftrast.in_array4f[0]);
// One zeroed block: ARRAY_TOTAL input arrays + ARRAY_TOTAL post arrays + 1 screencoord array,
// each maxvertices * 4 floats.
// NOTE(review): no NULL check on the allocation result is visible here.
1636 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
1637 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1638 dpsoftrast.in_array4f[i] = data;
1639 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1640 dpsoftrast.post_array4f[i] = data;
1641 dpsoftrast.screencoord4f = data;
1642 data += dpsoftrast.maxvertices * 4;
// Positions: 3 floats per vertex expanded to 4.
1644 stride = dpsoftrast.stride_vertex;
1645 v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1646 p = dpsoftrast.in_array4f[0];
1647 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
// Colors: float array if set, otherwise byte array if set, otherwise the constant
// dpsoftrast.color replicated for every vertex.
1650 if (dpsoftrast.pointer_color4f)
1652 stride = dpsoftrast.stride_color;
1653 v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1654 p = dpsoftrast.in_array4f[1];
1655 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1657 else if (dpsoftrast.pointer_color4ub)
1659 stride = dpsoftrast.stride_color;
1660 b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1661 p = dpsoftrast.in_array4f[1];
1662 DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
1666 p = dpsoftrast.in_array4f[1];
1667 DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
// Texcoord sets: set j is loaded into array slot j+2; sets without a pointer are skipped.
1670 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1672 if (dpsoftrast.pointer_texcoordf[j])
1674 stride = dpsoftrast.stride_texcoord[j];
1675 v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1676 p = dpsoftrast.in_array4f[j+2];
// Dispatch on the recorded component count (presumably 2, 3 and 4 map to the three
// loaders below in that order — case labels are not visible).
1677 switch(dpsoftrast.components_texcoord[j])
1680 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1683 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1686 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
// Transforms `numitems` 4-float vectors from in4f by the 16-float matrix inmatrix16f,
// producing results for out4f. Each result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, where
// m0..m3 are the four consecutive 4-float groups of the matrix.
// NOTE(review): loop headers, the store to out4f, the return after the identity fast path,
// and pointer increments are not visible in this listing.
1694 void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1697 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1698 __m128 m0, m1, m2, m3;
1699 float *end = out4f + numitems*4;
// Bitwise comparison against the identity matrix selects the plain-copy path.
1700 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1702 // fast case for identity matrix
1703 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// The matrix is always loaded unaligned; only the input array alignment is tested.
1706 m0 = _mm_loadu_ps(inmatrix16f);
1707 m1 = _mm_loadu_ps(inmatrix16f + 4);
1708 m2 = _mm_loadu_ps(inmatrix16f + 8);
1709 m3 = _mm_loadu_ps(inmatrix16f + 12);
1710 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Unaligned input: broadcast each component of v and accumulate against m0..m3.
1714 __m128 v = _mm_loadu_ps(in4f);
1716 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1717 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1718 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1719 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Aligned input: same computation with an aligned load.
1728 __m128 v = _mm_load_ps(in4f);
1730 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1731 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1732 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1733 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f.
// The two ranges must not overlap (memcpy semantics). A non-positive count
// copies nothing, rather than being converted to a huge unsigned byte count.
void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1747 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1749 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1750 __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1751 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1752 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1753 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
// For each of `numitems` 4-float vectors in in4f: copies the vector unchanged to out4f and
// writes its viewport-projected form to screen4f.
// All loads and stores here are aligned, so in4f, out4f and screen4f must be 16-byte aligned.
// NOTE(review): the loop header and pointer increments are not visible in this listing.
1758 void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
1761 float *end = out4f + numitems*4;
1762 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// w is the fourth component broadcast to all lanes.
1765 __m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1766 _mm_store_ps(out4f, v);
// Rotate to (w, x, y, z) and replace lane 0 with 1.0, giving (1, x, y, z).
1767 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
// center + scale * (1, x, y, z) / w
1768 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
// Rotate back so the lane that held 1.0 becomes the last component.
1769 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// For each of `numitems` 4-float vectors in in4f: multiplies by the 16-float matrix
// inmatrix16f, stores the transformed vector to out4f, and stores its viewport-projected
// form to screen4f. An identity matrix (bitwise match) delegates to DPSOFTRAST_Array_Project.
// out4f and screen4f are written with aligned stores.
// NOTE(review): loop headers, the return after the identity path, the else, and pointer
// increments are not visible in this listing.
1777 void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
1780 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1781 __m128 m0, m1, m2, m3, viewportcenter, viewportscale;
1782 float *end = out4f + numitems*4;
1783 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1785 DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
1788 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1789 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// The matrix is always loaded unaligned; only the input array alignment is tested.
1790 m0 = _mm_loadu_ps(inmatrix16f);
1791 m1 = _mm_loadu_ps(inmatrix16f + 4);
1792 m2 = _mm_loadu_ps(inmatrix16f + 8);
1793 m3 = _mm_loadu_ps(inmatrix16f + 12);
1794 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Unaligned input path.
1798 __m128 v = _mm_loadu_ps(in4f), w;
// v = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3
1799 v = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1800 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1801 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1802 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
1803 _mm_store_ps(out4f, v);
// Projection: center + scale * (1, x, y, z) / w, then rotate the 1/w-derived lane to the end.
1804 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1805 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1806 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1807 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Aligned input path: identical computation with an aligned load.
1817 __m128 v = _mm_load_ps(in4f), w;
1818 v = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1819 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1820 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1821 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
1822 _mm_store_ps(out4f, v);
1823 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1824 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1825 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1826 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Prepares per-pixel reciprocal-w values for a span, walking x from span->startx to
// span->endx in sub-spans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels. The exact 1/w is computed at
// sub-span endpoints and stepped linearly (z += dz) in between.
// NOTE(review): the declarations of x/z/dz and the inner loop body are not visible in this
// listing; presumably the body writes z into zf[x] — confirm.
1835 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1838 int startx = span->startx;
1839 int endx = span->endx;
// w varies linearly in screen space: w[0] is the per-x slope, w[1] the per-y slope, w[2] the base.
1840 float wslope = triangle->w[0];
1841 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// Reciprocal of w at the first pixel of the span.
1842 float endz = 1.0f / (w + wslope * startx);
1843 for (x = startx;x < endx;)
1845 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// The last sub-span is clamped so it ends exactly on the final pixel.
1847 if(nextsub >= endx) nextsub = endsub = endx-1;
1848 endz = 1.0f / (w + wslope * nextsub);
// Per-pixel step; zero when the sub-span is a single pixel (avoids dividing by zero).
1849 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1850 for (; x <= endsub; x++, z += dz)
// Writes a span of float colors (in4f, 4 floats per pixel) into the 8-bit color
// framebuffer, combining with the existing pixels according to thread->fb_blendmode.
// Input component 0 is written to framebuffer byte 2 and component 2 to byte 0 (components
// 1 and 3 keep their positions), i.e. the first and third channels are swapped on output.
// NOTE(review): declarations of x/a/b/d[], the per-pixel pixelmask test, braces and `break`
// statements are not visible in this listing.
1855 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1858 int startx = span->startx;
1859 int endx = span->endx;
1862 unsigned char * RESTRICT pixelmask = span->pixelmask;
1863 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Advance to the span's origin pixel; 4 bytes per pixel.
1866 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1867 // handle alphatest now (this affects depth writes too)
// Pixels whose alpha is below a fixed 0.5 are removed from the mask.
// NOTE(review): the threshold is the constant 0.5f here, not thread->alphavalue.
1868 if (thread->alphatest)
1869 for (x = startx;x < endx;x++)
1870 if (in4f[x*4+3] < 0.5f)
1871 pixelmask[x] = false;
1872 // FIXME: this does not handle bigendian
1873 switch(thread->fb_blendmode)
// OPAQUE: result = src * 255, clamped to 255.
// NOTE(review): only the upper clamp is visible in most modes; negative results are not
// clamped in the visible code except in SUBALPHA.
1875 case DPSOFTRAST_BLENDMODE_OPAQUE:
1876 for (x = startx;x < endx;x++)
1880 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1881 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1882 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1883 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1884 pixel[x*4+0] = d[0];
1885 pixel[x*4+1] = d[1];
1886 pixel[x*4+2] = d[2];
1887 pixel[x*4+3] = d[3];
// ALPHA: result = src * (alpha * 255) + dst * (1 - alpha).
1890 case DPSOFTRAST_BLENDMODE_ALPHA:
1891 for (x = startx;x < endx;x++)
1895 a = in4f[x*4+3] * 255.0f;
1896 b = 1.0f - in4f[x*4+3];
1897 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1898 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1899 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1900 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1901 pixel[x*4+0] = d[0];
1902 pixel[x*4+1] = d[1];
1903 pixel[x*4+2] = d[2];
1904 pixel[x*4+3] = d[3];
// ADDALPHA: result = src * (alpha * 255) + dst.
1907 case DPSOFTRAST_BLENDMODE_ADDALPHA:
1908 for (x = startx;x < endx;x++)
1912 a = in4f[x*4+3] * 255.0f;
1913 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1914 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1915 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1916 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1917 pixel[x*4+0] = d[0];
1918 pixel[x*4+1] = d[1];
1919 pixel[x*4+2] = d[2];
1920 pixel[x*4+3] = d[3];
// ADD: result = src * 255 + dst.
1923 case DPSOFTRAST_BLENDMODE_ADD:
1924 for (x = startx;x < endx;x++)
1928 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1929 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1930 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1931 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1932 pixel[x*4+0] = d[0];
1933 pixel[x*4+1] = d[1];
1934 pixel[x*4+2] = d[2];
1935 pixel[x*4+3] = d[3];
// INVMOD: result = (1 - src) * dst.
1938 case DPSOFTRAST_BLENDMODE_INVMOD:
1939 for (x = startx;x < endx;x++)
1943 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1944 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1945 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1946 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1947 pixel[x*4+0] = d[0];
1948 pixel[x*4+1] = d[1];
1949 pixel[x*4+2] = d[2];
1950 pixel[x*4+3] = d[3];
// MUL: result = src * dst.
1953 case DPSOFTRAST_BLENDMODE_MUL:
1954 for (x = startx;x < endx;x++)
1958 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1959 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1960 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1961 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1962 pixel[x*4+0] = d[0];
1963 pixel[x*4+1] = d[1];
1964 pixel[x*4+2] = d[2];
1965 pixel[x*4+3] = d[3];
// MUL2: result = src * dst * 2.
1968 case DPSOFTRAST_BLENDMODE_MUL2:
1969 for (x = startx;x < endx;x++)
1973 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
1974 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
1975 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
1976 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
1977 pixel[x*4+0] = d[0];
1978 pixel[x*4+1] = d[1];
1979 pixel[x*4+2] = d[2];
1980 pixel[x*4+3] = d[3];
// SUBALPHA: result = dst - src * (alpha * 255); a is negative, and this is the one mode
// that also clamps at 0.
1983 case DPSOFTRAST_BLENDMODE_SUBALPHA:
1984 for (x = startx;x < endx;x++)
1988 a = in4f[x*4+3] * -255.0f;
1989 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
1990 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
1991 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
1992 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
1993 pixel[x*4+0] = d[0];
1994 pixel[x*4+1] = d[1];
1995 pixel[x*4+2] = d[2];
1996 pixel[x*4+3] = d[3];
// PSEUDOALPHA: result = src * a + dst * (1 - alpha).
// NOTE(review): the assignment to `a` for this mode is not visible in this listing.
1999 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2000 for (x = startx;x < endx;x++)
2005 b = 1.0f - in4f[x*4+3];
2006 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2007 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2008 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2009 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2010 pixel[x*4+0] = d[0];
2011 pixel[x*4+1] = d[1];
2012 pixel[x*4+2] = d[2];
2013 pixel[x*4+3] = d[3];
// Writes a span of 8-bit-per-channel colors (in4ub, 4 bytes per pixel) into the color
// framebuffer, combining with the existing pixels according to thread->fb_blendmode.
// Uses SSE2 integer intrinsics: pixels are widened to 16-bit lanes, blended, and packed
// back with unsigned saturation (_mm_packus_epi16).
// NOTE(review): local declarations, parts of the OPAQUE loop, the case labels inside the
// FINISHBLEND macro, and the FINISHBLEND(...) invocation lines for several modes are not
// visible in this listing.
2019 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2023 int startx = span->startx;
2024 int endx = span->endx;
// ini / pixeli view the same data as 32-bit pixels for whole-pixel loads and stores.
2025 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2026 unsigned char * RESTRICT pixelmask = span->pixelmask;
2027 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2028 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// Advance both framebuffer views to the span's origin pixel.
2031 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2032 pixeli += span->y * dpsoftrast.fb_width + span->x;
2033 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub is 8-bit, so `< 0.5f` is only true for alpha == 0; confirm whether a
// mid-scale threshold was intended.
2034 if (thread->alphatest)
2035 for (x = startx;x < endx;x++)
2036 if (in4ub[x*4+3] < 0.5f)
2037 pixelmask[x] = false;
2038 // FIXME: this does not handle bigendian
2039 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1, copy four pixels with one 16-byte store.
2041 case DPSOFTRAST_BLENDMODE_OPAQUE:
2042 for (x = startx;x + 4 <= endx;)
2044 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2046 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2060 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): shared blend loop. It processes pixels two at a time,
// switching on the pair of mask bytes, then finishes any odd trailing pixel. blend2 is the
// blend statement used when two pixels are held in src/dst, blend1 when one pixel is held
// (presumably selected per mask case — the case labels are not visible).
2061 #define FINISHBLEND(blend2, blend1) \
2062 for (x = startx;x + 2 <= endx;x += 2) \
2065 switch (*(const unsigned short*)&pixelmask[x]) \
2068 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2069 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2071 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2074 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2075 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2077 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2080 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2081 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2083 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2088 for(;x < endx; x++) \
2091 if (!pixelmask[x]) \
2093 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2094 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2096 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
// ALPHA blend: broadcast the source alpha lane, then dst += (src - dst) * alpha / 256
// (both operands shifted left by 4 so the high half of the 16x16 product is the scaled value).
// The shufflehi+shufflelo form broadcasts alpha for two pixels; the shufflelo-only form for one.
2100 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2101 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2103 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2104 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8.
2107 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2109 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2110 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2112 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2113 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (saturated to 8 bits when packed).
2116 case DPSOFTRAST_BLENDMODE_ADD:
2117 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8.
2119 case DPSOFTRAST_BLENDMODE_INVMOD:
2121 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2123 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8.
2126 case DPSOFTRAST_BLENDMODE_MUL:
2127 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result.
2129 case DPSOFTRAST_BLENDMODE_MUL2:
2130 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8.
2132 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2134 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2135 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2137 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2138 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8).
2141 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2143 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2144 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2146 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2147 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// Samples the 2D texture bound to unit texunitindex along one span and writes
// four floats per pixel to out4f[x*4 .. x*4+3]. Texel bytes 2,1,0,3 become
// output components 0,1,2,3.
//   thread     - provides texbound[texunitindex]
//   triangle   - provides mip[texunitindex]; also handed to DPSOFTRAST_CALCATTRIB4F
//   span       - provides the pixel range startx .. endx-1
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB4F for the texcoords
//   zf         - per-pixel factor multiplied into the interpolated texcoords
//                (presumably the perspective-correction term - TODO confirm)
// NOTE(review): short source lines (braces, some declarations and conditions)
// are not present in this listing; the comments describe visible statements only.
2154 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2157 int startx = span->startx;
2158 int endx = span->endx;
2163 float tc[2], endtc[2];
2165 unsigned int tci[2];
2166 unsigned int tci1[2];
2167 unsigned int tcimin[2];
2168 unsigned int tcimax[2];
2173 const unsigned char * RESTRICT pixelbase;
2174 const unsigned char * RESTRICT pixel[4];
2175 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2176 // if no texture is bound, just fill it with white
2179 for (x = startx;x < endx;x++)
2181 out4f[x*4+0] = 1.0f;
2182 out4f[x*4+1] = 1.0f;
2183 out4f[x*4+2] = 1.0f;
2184 out4f[x*4+3] = 1.0f;
2188 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2189 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2190 // if this mipmap of the texture is 1 pixel, just fill it with that color
2191 if (texture->mipmap[mip][1] == 4)
// NOTE(review): the colour is read from texture->bytes[0..3], not from
// pixelbase[0..3], so it comes from the start of the texture data rather than
// from this mip level; the BGRA8 variant of this function reads through
// pixelbase - confirm which is intended.
2193 c[0] = texture->bytes[2] * (1.0f/255.0f);
2194 c[1] = texture->bytes[1] * (1.0f/255.0f);
2195 c[2] = texture->bytes[0] * (1.0f/255.0f);
2196 c[3] = texture->bytes[3] * (1.0f/255.0f);
2197 for (x = startx;x < endx;x++)
2199 out4f[x*4+0] = c[0];
2200 out4f[x*4+1] = c[1];
2201 out4f[x*4+2] = c[2];
2202 out4f[x*4+3] = c[3];
2206 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2207 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2208 flags = texture->flags;
// mipmap[mip][2] and mipmap[mip][3] scale the two texcoords; [2] is also the
// row pitch in texels used when forming texel addresses below
2209 tcscale[0] = texture->mipmap[mip][2];
2210 tcscale[1] = texture->mipmap[mip][3];
2211 tciwidth = texture->mipmap[mip][2];
2214 tcimax[0] = texture->mipmap[mip][2]-1;
2215 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 is used as an AND mask in the wrapping paths, which only wraps
// correctly for power-of-two sizes - TODO confirm sizes are always powers of two
2216 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2217 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texture coordinate (in texels, shifted by half a texel) at the first pixel of the span
2218 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2219 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// the span is walked in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
// the coordinate is evaluated at the sub-span end and stepped linearly in
// 16.16 fixed point in between
2220 for (x = startx;x < endx;)
2222 unsigned int subtc[2];
2223 unsigned int substep[2];
2224 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2225 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// shortened final sub-span ending on the last pixel of the span (the guarding
// condition is not visible in this listing)
2228 nextsub = endsub = endx-1;
2229 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2233 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2234 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): substep and subtc are unsigned int; converting a negative float
// to an unsigned type is undefined in ISO C, and both a negative step and a
// negative start coordinate look possible here - confirm.
2235 substep[0] = (endtc[0] - tc[0]) * subscale;
2236 substep[1] = (endtc[1] - tc[1]) * subscale;
2237 subtc[0] = tc[0] * (1<<16);
2238 subtc[1] = tc[1] * (1<<16);
// four-texel weighted blend, coordinates clamped to [tcimin, tcimax]
2241 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2243 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): the texel index below is subtc>>16, so the fraction is bits 0..15,
// yet the weight is taken from bits 0..11 only (bits 12..15 are ignored);
// (subtc>>4)&0xFFF would be the top 12 fraction bits - confirm intent.
2245 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2246 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights always sum to 0x1000*0x1000 = 0x1000000
2247 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2248 tci[0] = subtc[0]>>16;
2249 tci[1] = subtc[1]>>16;
2250 tci1[0] = tci[0] + 1;
2251 tci1[1] = tci[1] + 1;
2252 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2253 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2254 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2255 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the four neighbouring texels (4 bytes per texel)
2256 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2257 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2258 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2259 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * 0x1000000, i.e. the largest possible weighted sum, so the result is 0..1
2260 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2261 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2262 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2263 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2264 out4f[x*4+0] = c[0];
2265 out4f[x*4+1] = c[1];
2266 out4f[x*4+2] = c[2];
2267 out4f[x*4+3] = c[3];
// four-texel weighted blend, coordinates wrapped with tciwrapmask
// (the low-12-bit weight remark above applies here as well)
2272 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2274 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2275 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2276 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2277 tci[0] = subtc[0]>>16;
2278 tci[1] = subtc[1]>>16;
2279 tci1[0] = tci[0] + 1;
2280 tci1[1] = tci[1] + 1;
2281 tci[0] &= tciwrapmask[0];
2282 tci[1] &= tciwrapmask[1];
2283 tci1[0] &= tciwrapmask[0];
2284 tci1[1] &= tciwrapmask[1];
2285 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2286 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2287 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2288 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2289 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2290 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2291 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2292 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2293 out4f[x*4+0] = c[0];
2294 out4f[x*4+1] = c[1];
2295 out4f[x*4+2] = c[2];
2296 out4f[x*4+3] = c[3];
// single-texel lookup, coordinates clamped to [tcimin, tcimax]
2300 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2302 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2304 tci[0] = subtc[0]>>16;
2305 tci[1] = subtc[1]>>16;
2306 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2307 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2308 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2309 c[0] = pixel[0][2] * (1.0f / 255.0f);
2310 c[1] = pixel[0][1] * (1.0f / 255.0f);
2311 c[2] = pixel[0][0] * (1.0f / 255.0f);
2312 c[3] = pixel[0][3] * (1.0f / 255.0f);
2313 out4f[x*4+0] = c[0];
2314 out4f[x*4+1] = c[1];
2315 out4f[x*4+2] = c[2];
2316 out4f[x*4+3] = c[3];
// single-texel lookup, coordinates wrapped with tciwrapmask
2321 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2323 tci[0] = subtc[0]>>16;
2324 tci[1] = subtc[1]>>16;
2325 tci[0] &= tciwrapmask[0];
2326 tci[1] &= tciwrapmask[1];
2327 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2328 c[0] = pixel[0][2] * (1.0f / 255.0f);
2329 c[1] = pixel[0][1] * (1.0f / 255.0f);
2330 c[2] = pixel[0][0] * (1.0f / 255.0f);
2331 c[3] = pixel[0][3] * (1.0f / 255.0f);
2332 out4f[x*4+0] = c[0];
2333 out4f[x*4+1] = c[1];
2334 out4f[x*4+2] = c[2];
2335 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: for each pixel of the span it
// writes one packed 4-byte value (a texel, or a blend of four texels) to
// out4ub, addressed as 32-bit words through outi.
//   thread     - provides texbound[texunitindex]
//   triangle   - provides mip[texunitindex]; also handed to DPSOFTRAST_CALCATTRIB
//   span       - provides startx, endx and length
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB for the texcoords
//   zf         - per-pixel factor multiplied into the interpolated texcoords
//                (presumably the perspective-correction term - TODO confirm)
// NOTE(review): short source lines (braces, some declarations and conditions)
// are not present in this listing; the comments describe visible statements only.
2341 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2345 int startx = span->startx;
2346 int endx = span->endx;
2348 __m128 data, slope, tcscale;
2349 __m128i tcsize, tcmask, tcoffset, tcmax;
2351 __m128i subtc, substep, endsubtc;
2354 unsigned int *outi = (unsigned int *)out4ub;
2355 const unsigned char * RESTRICT pixelbase;
2356 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2357 // if no texture is bound, just fill it with white
2360 memset(out4ub + startx*4, 255, span->length*4);
2363 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2364 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2365 // if this mipmap of the texture is 1 pixel, just fill it with that color
2366 if (texture->mipmap[mip][1] == 4)
// the single texel of this mip level, read as one 32-bit word
2368 unsigned int k = *((const unsigned int *)pixelbase);
2369 for (x = startx;x < endx;x++)
2373 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2374 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2375 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3]) repeated twice; tcmask = tcsize - 1
2376 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2377 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2378 tcscale = _mm_cvtepi32_ps(tcsize);
// keep the first two attribute components, duplicated into both halves, pre-scaled by the texture size
2379 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2380 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// coordinate at startx, shifted by half a texel, then converted to 16.16 fixed point
2381 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2382 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane holds 4 in its low 16 bits and mipmap[mip][2]*4 in its high
// 16 bits, so _mm_madd_epi16 on an (s, t) word pair gives s*4 + t*width*4,
// which is used below as a byte offset from pixelbase
2383 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// filtered paths limit packed 16-bit (s, t) pairs; unfiltered paths limit the
// high (integer) word of each 16.16 lane
2384 tcmax = filter ? _mm_packs_epi32(tcmask, tcmask) : _mm_slli_epi32(tcmask, 16);
// the span is walked in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels,
// stepping the coordinate linearly in fixed point inside each one
2385 for (x = startx;x < endx;)
2387 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2388 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// shortened final sub-span ending on the last pixel of the span (the guarding
// condition is not visible in this listing)
2391 nextsub = endsub = endx-1;
2392 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2396 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2397 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2398 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinate of pixel x, high half = coordinate of pixel x+1;
// the step is doubled because the loops below advance two pixels at a time
2399 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2400 substep = _mm_slli_epi32(substep, 1);
// integer coordinates at the start of the sub-span and one step past its end;
// if all four are >= 0 and strictly below size-1, the +1 neighbours are in range
// too and the loop below needs no clamping or wrapping
2403 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2404 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2406 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// lanes become (s0,t0), (s0,t0+1), (s1,t1), (s1,t1+1) and then byte offsets;
// each 8-byte load fetches two horizontally adjacent texels
2408 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2409 tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
2410 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2411 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2412 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
2413 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
// fractions are halved and the differences doubled so that the signed
// _mm_mulhi_epi16 computes difference * fraction / 65536
2414 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows of pixel x using its t fraction (word 2)
2415 pix1 = _mm_add_epi16(pix1,
2416 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2417 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
// blend the two rows of pixel x+1 using its t fraction (word 6)
2418 pix3 = _mm_add_epi16(pix3,
2419 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2420 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup into left texels (pix2) and right texels (pix4) of both pixels,
// then blend horizontally with each pixel's s fraction (words 0 and 4)
2421 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2422 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2423 pix2 = _mm_add_epi16(pix2,
2424 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2425 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// pack both results to bytes with unsigned saturation and store two pixels at once
2426 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the unclamped bilinear blend above (its guarding
// condition is not visible in this listing)
2430 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2431 tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
2432 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2433 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2434 fracm = _mm_srli_epi16(subtc, 1);
2435 pix1 = _mm_add_epi16(pix1,
2436 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2437 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2438 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2439 pix1 = _mm_add_epi16(pix1,
2440 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2441 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2442 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear blend with clamp-to-edge: the four taps (s,t), (s+1,t), (s,t+1),
// (s+1,t+1) are limited individually and each texel is loaded as its own 32-bit word
2446 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2448 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2450 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
// taps of pixel x clamped to 0 .. tcmax
2451 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2452 tci = _mm_madd_epi16(tci, tcoffset);
2453 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2454 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2455 _mm_setzero_si128());
2456 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2457 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2458 _mm_setzero_si128());
2459 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the taps of pixel x+1 are ANDed with tcmax (wrapping) although
// this is the clamp-to-edge branch and pixel x above uses min/max; this looks
// like a copy/paste slip from the wrapping loop - confirm and clamp here too.
2460 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2461 tci = _mm_madd_epi16(tci, tcoffset);
2462 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2463 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2464 _mm_setzero_si128());
2465 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2466 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2467 _mm_setzero_si128());
// same two-stage blend as in the unclamped loop
2468 fracm = _mm_srli_epi16(subtc, 1);
2469 pix1 = _mm_add_epi16(pix1,
2470 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2471 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2472 pix3 = _mm_add_epi16(pix3,
2473 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2474 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2475 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2476 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2477 pix2 = _mm_add_epi16(pix2,
2478 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2479 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2480 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the clamped bilinear blend
2484 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2485 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2486 tci = _mm_madd_epi16(tci, tcoffset);
2487 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2488 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2489 _mm_setzero_si128());
2490 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2491 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2492 _mm_setzero_si128());
2493 fracm = _mm_srli_epi16(subtc, 1);
2494 pix1 = _mm_add_epi16(pix1,
2495 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2496 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2497 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2498 pix1 = _mm_add_epi16(pix1,
2499 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2500 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2501 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear blend with wrapping: all four taps of both pixels are ANDed with tcmax
2507 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2509 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2510 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2511 tci = _mm_madd_epi16(tci, tcoffset);
2512 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2513 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2514 _mm_setzero_si128());
2515 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2516 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2517 _mm_setzero_si128());
2518 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2519 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2520 tci = _mm_madd_epi16(tci, tcoffset);
2521 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2522 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2523 _mm_setzero_si128());
2524 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2525 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2526 _mm_setzero_si128());
2527 fracm = _mm_srli_epi16(subtc, 1);
2528 pix1 = _mm_add_epi16(pix1,
2529 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2530 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2531 pix3 = _mm_add_epi16(pix3,
2532 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2533 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2534 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2535 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2536 pix2 = _mm_add_epi16(pix2,
2537 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2538 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2539 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the wrapped bilinear blend
2543 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2544 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2545 tci = _mm_madd_epi16(tci, tcoffset);
2546 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2547 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2548 _mm_setzero_si128());
2549 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2550 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2551 _mm_setzero_si128());
2552 fracm = _mm_srli_epi16(subtc, 1);
2553 pix1 = _mm_add_epi16(pix1,
2554 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2557 pix1 = _mm_add_epi16(pix1,
2558 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2559 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2560 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel lookup with clamp-to-edge: the 16.16 coordinates are limited
// to 0 .. tcmax, the integer words are selected and turned into byte offsets
2567 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2569 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2571 __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
2572 tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2573 tci = _mm_madd_epi16(tci, tcoffset);
2574 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2575 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
// single-pixel form of the clamped lookup
2579 __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
2580 tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2581 tci = _mm_madd_epi16(tci, tcoffset);
2582 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel lookup with wrapping: the coordinates are ANDed with tcmax
2588 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2590 __m128i tci = _mm_and_si128(subtc, tcmax);
2591 tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2592 tci = _mm_madd_epi16(tci, tcoffset);
2593 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2594 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
// single-pixel form of the wrapped lookup
2598 __m128i tci = _mm_and_si128(subtc, tcmax);
2599 tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2600 tci = _mm_madd_epi16(tci, tcoffset);
2601 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D span sampler. The only statement visible here
// fills span->length*4 bytes of out4ub with 255; no texture lookup is shown.
// NOTE(review): the fill starts at out4ub itself, whereas the 2D variant's
// no-texture fill starts at out4ub + startx*4 - confirm which offset is intended.
2610 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2613 memset(out4ub, 255, span->length*4);
// Takes a read-only float vector and returns a float.
// NOTE(review): the body is not visible in this listing, so what is sampled
// and what is returned cannot be documented from here.
2616 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For every pixel x in startx .. endx-1, evaluates the four-component attribute
// (data + slope*x) * z and multiplies it component-wise into in4f, writing the
// product to out4f (four floats per pixel).
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB4F
//   zf         - per-pixel factor array; z is presumably zf[x], but that
//                assignment is not visible in this listing - TODO confirm
2622 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2625 int startx = span->startx;
2626 int endx = span->endx;
2631 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2632 for (x = startx;x < endx;x++)
// interpolated attribute at pixel x
2635 c[0] = (data[0] + slope[0]*x) * z;
2636 c[1] = (data[1] + slope[1]*x) * z;
2637 c[2] = (data[2] + slope[2]*x) * z;
2638 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input buffer by it
2639 out4f[x*4+0] = in4f[x*4+0] * c[0];
2640 out4f[x*4+1] = in4f[x*4+1] * c[1];
2641 out4f[x*4+2] = in4f[x*4+2] * c[2];
2642 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For every pixel x in startx .. endx-1, evaluates the four-component attribute
// (data + slope*x) * z and stores it in out4f (four floats per pixel).
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB4F
//   zf         - per-pixel factor array; z is presumably zf[x], but that
//                assignment is not visible in this listing - TODO confirm
2646 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2649 int startx = span->startx;
2650 int endx = span->endx;
2655 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2656 for (x = startx;x < endx;x++)
// interpolated attribute at pixel x
2659 c[0] = (data[0] + slope[0]*x) * z;
2660 c[1] = (data[1] + slope[1]*x) * z;
2661 c[2] = (data[2] + slope[2]*x) * z;
2662 c[3] = (data[3] + slope[3]*x) * z;
2663 out4f[x*4+0] = c[0];
2664 out4f[x*4+1] = c[1];
2665 out4f[x*4+2] = c[2];
2666 out4f[x*4+3] = c[3];
// For every pixel of the span: out4f = ina4f + max(inb4f - subcolor, 0),
// component-wise on four floats per pixel.
//   subcolor - four floats subtracted from inb4f before the addition
2670 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2672 int x, startx = span->startx, endx = span->endx;
2673 float c[4], localcolor[4];
// local copy of subcolor, taken once before the loop
2674 localcolor[0] = subcolor[0];
2675 localcolor[1] = subcolor[1];
2676 localcolor[2] = subcolor[2];
2677 localcolor[3] = subcolor[3];
2678 for (x = startx;x < endx;x++)
// subtract, then clamp negative results to zero
2680 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2681 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2682 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2683 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2684 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2685 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2686 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2687 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// For every pixel of the span: out4f = ina4f * inb4f, component-wise on four
// floats per pixel. The triangle argument is not used by the visible statements.
2691 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2693 int x, startx = span->startx, endx = span->endx;
2694 for (x = startx;x < endx;x++)
2696 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2697 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2698 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2699 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// For every pixel of the span: out4f = ina4f + inb4f, component-wise on four
// floats per pixel. The triangle argument is not used by the visible statements.
2703 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2705 int x, startx = span->startx, endx = span->endx;
2706 for (x = startx;x < endx;x++)
2708 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2709 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2710 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2711 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// For every pixel of the span: out4f = ina4f * a + inb4f * b, component-wise on
// four floats per pixel, where a is one minus the fourth component of inb4f.
// NOTE(review): the assignment of b is not visible in this listing; presumably
// b is the fourth component of inb4f so that a + b = 1 - confirm.
2715 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2717 int x, startx = span->startx, endx = span->endx;
2719 for (x = startx;x < endx;x++)
// weight of the first buffer
2721 a = 1.0f - inb4f[x*4+3];
2723 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2724 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2725 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2726 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a constant colour over the span: for every pixel,
// out4f = in4f * (1 - color[3]) + color * color[3], component-wise on four
// floats per pixel (the fourth component is blended the same way).
//   color - four floats; the fourth one is the blend weight
2730 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2732 int x, startx = span->startx, endx = span->endx;
2733 float localcolor[4], ilerp, lerp;
// local copy of color, taken once before the loop
2734 localcolor[0] = color[0];
2735 localcolor[1] = color[1];
2736 localcolor[2] = color[2];
2737 localcolor[3] = color[3];
// lerp weights the constant colour, ilerp weights the input buffer
2738 ilerp = 1.0f - localcolor[3];
2739 lerp = localcolor[3];
2740 for (x = startx;x < endx;x++)
2742 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2743 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2744 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2745 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2 byte-pixel version of the "multiply by interpolated attribute" span
// operation: each 4-byte pixel of in4ub is scaled component-wise by the
// attribute (data + slope*x) * zf[x] and written to out4ub.
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB
//   zf         - per-pixel factor, indexed by x
2751 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2755 int startx = span->startx;
2756 int endx = span->endx;
2758 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute so it lines up with the byte order of the pixels
2759 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2760 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// start at startx and pre-scale by 256 so an attribute of 1.0 becomes the integer 256
2761 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2762 data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2763 slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
// two pixels per iteration; data is advanced once inside the body and once in the loop header
2764 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
// pixel bytes go into the high byte of each 16-bit word (value * 256)
2766 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2767 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2768 data = _mm_add_ps(data, slope);
2769 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
// NOTE(review): _mm_packs_epi32 saturates to signed 16 bits while the multiply
// below is unsigned, so a negative modulator would act as a large positive one - confirm inputs are non-negative.
2770 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
// (byte*256 * mod) >> 16 = byte * mod / 256, then packed to bytes with unsigned saturation
2771 pix = _mm_mulhi_epu16(pix, mod);
2772 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// remaining single pixel, same arithmetic
2774 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2776 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2777 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2778 mod = _mm_packs_epi32(mod, mod);
2779 pix = _mm_mulhi_epu16(pix, mod);
2780 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 byte-pixel version of the "interpolated attribute" span operation:
// writes (data + slope*x) * zf[x], scaled by 255 and saturated to bytes, as one
// 4-byte pixel per x into out4ub.
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB
//   zf         - per-pixel factor, indexed by x
2785 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2789 int startx = span->startx;
2790 int endx = span->endx;
2792 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute before it is written out as bytes
2793 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2794 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// start at startx and pre-scale by 255 so an attribute of 1.0 becomes byte value 255
2795 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2796 data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2797 slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
// two pixels per iteration; data is advanced once inside the body and once in the loop header
2798 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2800 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2801 data = _mm_add_ps(data, slope);
2802 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
// narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation)
2803 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2804 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// remaining single pixel, same arithmetic
2806 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2808 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2809 pix = _mm_packs_epi32(pix, pix);
2810 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 byte-pixel version of the bloom add: for every pixel of the span,
// out4ub = saturate(ina4ub + (inb4ub - subcolor*255)), per byte component.
//   subcolor - four floats, scaled by 255 and reordered (components 0 and 2
//              swapped) to match the byte order of the pixels
// NOTE(review): the difference inb - subcolor is not clamped at zero before it
// is added (only the final sum is saturated), whereas the float AddBloom clamps
// the difference first; results differ when inb < subcolor - confirm intent.
2815 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2818 int x, startx = span->startx, endx = span->endx;
2819 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit words and repeat the colour so it covers two pixels
2820 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// two pixels per iteration, widened to 16-bit words for the arithmetic
2821 for (x = startx;x+2 <= endx;x+=2)
2823 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2824 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2825 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2826 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same operation (its loop header is not visible in this listing)
2830 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2831 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2832 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2833 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel modulate of two 4-byte-per-pixel buffers over one span: out = (ina * inb) >> 8.
// All three buffers are indexed with x*4 for x starting at span->startx and bounded by span->endx.
// triangle is not referenced in the visible lines.
2838 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2841 int x, startx = span->startx, endx = span->endx;
// Main loop: two pixels (8 bytes) per iteration.
2842 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are widened into the low byte of each 16-bit lane (value a).
2844 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb bytes are placed in the high byte of each 16-bit lane (value b << 8).
2845 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// High 16 bits of a * (b << 8), i.e. (a * b) >> 8.
2846 pix1 = _mm_mulhi_epu16(pix1, pix2);
2847 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Same arithmetic for a single pixel (32-bit load/store).
// NOTE(review): the condition guarding this single-pixel tail is not visible in this listing.
2851 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2852 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2853 pix1 = _mm_mulhi_epu16(pix1, pix2);
2854 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel saturating add of two 4-byte-per-pixel buffers over one span: out = min(ina + inb, 255).
// All three buffers are indexed with x*4 for x starting at span->startx and bounded by span->endx.
// triangle is not referenced in the visible lines.
2859 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2862 int x, startx = span->startx, endx = span->endx;
// Main loop: two pixels (8 bytes) per iteration.
2863 for (x = startx;x+2 <= endx;x+=2)
// Widen both inputs to 16-bit lanes so the sum cannot overflow before the saturating pack.
2865 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2866 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2867 pix1 = _mm_add_epi16(pix1, pix2);
// packus clamps each lane to 0..255 while narrowing back to bytes.
2868 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Same arithmetic for a single pixel (32-bit load/store).
// NOTE(review): the condition guarding this single-pixel tail is not visible in this listing.
2872 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2873 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2874 pix1 = _mm_add_epi16(pix1, pix2);
2875 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second buffer to the first over one span: out = saturate_u8(ina + ((tint * inb) >> 8)).
// inbtintbgra points at four floats that are scaled by 256; no component reordering is applied to them here.
// All three pixel buffers are indexed with x*4 for x starting at span->startx and bounded by span->endx.
// triangle is not referenced in the visible lines.
2880 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2883 int x, startx = span->startx, endx = span->endx;
// Tint as 8.8 fixed point (1.0 -> 256).
2884 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// Narrow to 16-bit lanes and keep the same four-lane constant in both 64-bit halves (one half per pixel).
2885 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
// Main loop: two pixels (8 bytes) per iteration.
2886 for (x = startx;x+2 <= endx;x+=2)
2888 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb bytes go into the high byte of each lane (b << 8) so mulhi yields (tint * b) >> 8.
2889 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2890 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2891 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Same arithmetic for a single pixel (32-bit load/store).
// NOTE(review): the condition guarding this single-pixel tail is not visible in this listing.
2895 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2896 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2897 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2898 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends inb over ina using the fourth byte of each inb pixel as the weight:
// out = ina + (((inb - ina) * inb[3]) >> 8), per channel, over one span.
// All three buffers are indexed with x*4 for x starting at span->startx and bounded by span->endx.
// triangle is not referenced in the visible lines.
2903 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2906 int x, startx = span->startx, endx = span->endx;
// Main loop: two pixels (8 bytes) per iteration.
2907 for (x = startx;x+2 <= endx;x+=2)
2909 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2910 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// Broadcast lane 3 of each pixel (its fourth byte) to all four lanes of that pixel.
2911 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// Both factors are shifted left by 4 so the signed high-half multiply returns (difference * weight) >> 8.
2912 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2913 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Same arithmetic for a single pixel; only the low four lanes carry data, so one shufflelo is enough.
// NOTE(review): the condition guarding this single-pixel tail is not visible in this listing.
2917 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2918 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2919 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2920 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2921 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a constant color over a span of 4-byte pixels, weighted by the color's own fourth component:
// out = in + (((color - in) * color[3]) >> 8), per channel.
// color points at four floats that are scaled by 255 before use.
// out4ub and in4ub are indexed with x*4 for x starting at span->startx and bounded by span->endx.
// triangle is not referenced in the visible lines.
2926 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2929 int x, startx = span->startx, endx = span->endx;
// Convert color*255 to integers; the (3, 0, 1, 2) shuffle swaps components 0 and 2 (presumably RGBA -> BGRA order).
2930 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// Narrow to 16-bit lanes and keep the same four-lane constant in both 64-bit halves (one half per pixel).
2931 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// Broadcast lane 3 (the weight) to every lane and pre-shift it by 4 for the fixed-point multiply below.
2932 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// Main loop: two pixels (8 bytes) per iteration.
2933 for (x = startx;x+2 <= endx;x+=2)
2935 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// ((color - in) << 4) * (weight << 4), high 16 bits = ((color - in) * weight) >> 8.
2936 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2937 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// Same arithmetic for a single pixel (32-bit load/store).
// NOTE(review): the condition guarding this single-pixel tail is not visible in this listing.
2941 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2942 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2943 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader. Works on the global dpsoftrast state for dpsoftrast.numvertices vertices.
2950 void DPSOFTRAST_VertexShader_Generic(void)
// Transform/project input positions with the ModelViewProjectionMatrixM1 uniform; writes post_array4f[POSITION] and screencoord4f.
2952 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// COLOR and TEXCOORD0 are passed through unchanged.
2953 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
2954 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
// TEXCOORD1 is only passed through when the SPECULAR permutation bit is set.
2955 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2956 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
// Pixel stage of the generic shader: fills buffer_FragColorbgra8 for one span and passes it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The literal indices 1 and 2 passed below equal DPSOFTRAST_ARRAY_COLOR and DPSOFTRAST_ARRAY_TEXCOORD0 in DPSOFTRAST_ARRAY_e
// (presumably they select the interpolated vertex array; other shaders in this file pass the enum name in the same position).
// NOTE(review): brace and else lines are not visible in this listing, so the exact nesting of the branches below should be confirmed.
2959 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, sized for the longest span the rasterizer emits.
2961 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2962 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2963 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2964 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2965 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// DIFFUSE: sample the first texture unit and modulate it by the varying with index 1.
2966 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
2968 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
2969 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
// SPECULAR: sample the second texture unit and combine it with the result according to further permutation bits.
2970 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
2972 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
2973 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2976 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this tests the same SHADERPERMUTATION_COLORMAPPING bit as the branch above, so as an else-if it can never
// be taken and the AddBuffers path looks unreachable; confirm which permutation bit was intended.
2978 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2981 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2983 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
2986 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// Writes the varying with index 1 straight into the output buffer (presumably the path used when DIFFUSE is not set).
2991 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
2992 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: projects positions with ModelViewProjectionMatrixM1 and
// passes TEXCOORD0 and TEXCOORD1 through unchanged, for dpsoftrast.numvertices vertices.
2997 void DPSOFTRAST_VertexShader_PostProcess(void)
2999 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3000 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3001 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
// Pixel stage of the post-process shader: samples the first texture unit into the output buffer, optionally adds
// bloom from the second unit, mixes in the ViewTintColor uniform, then finishes the span.
// The literal indices 2 and 3 equal DPSOFTRAST_ARRAY_TEXCOORD0 and DPSOFTRAST_ARRAY_TEXCOORD1 in DPSOFTRAST_ARRAY_e.
3004 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3006 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3007 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3008 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3009 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3010 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3011 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
// BLOOM: sample the second unit and add it, minus the BloomColorSubtract uniform, onto the output in place.
3012 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3014 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3015 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// Blend the ViewTintColor uniform over the output in place.
3017 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3018 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3020 // TODO: implement saturation
3022 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3024 // TODO: implement gammaramps
3026 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for depth/shadow rendering: only positions are transformed and projected (ModelViewProjectionMatrixM1);
// no other vertex arrays are written.
3031 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3033 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for depth/shadow rendering: produces an all-zero color span and finishes it.
3036 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3038 // this is never called (because colormask is off when this shader is used)
3039 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3040 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3041 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero the first span->length pixels (4 bytes each) of the output buffer.
3042 memset(buffer_FragColorbgra8, 0, span->length*4);
3043 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: projects positions with ModelViewProjectionMatrixM1 and
// transforms TEXCOORD0 by the TexMatrixM1 uniform, for dpsoftrast.numvertices vertices.
3048 void DPSOFTRAST_VertexShader_FlatColor(void)
3050 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3051 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader: samples the color texture and scales each channel by a constant
// built from the Color_Ambient uniform (first three channels) and the Alpha uniform (fourth channel).
// The literal index 2 equals DPSOFTRAST_ARRAY_TEXCOORD0 in DPSOFTRAST_ARRAY_e.
3054 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3056 int x, startx = span->startx, endx = span->endx;
3057 int Color_Ambienti[4];
3058 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3059 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3060 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// 8.8 fixed-point factors (1.0 -> 256, truncated). Uniform components 0,1,2 land in slots 2,1,0,
// i.e. the first and third components are swapped to match the byte order of the pixel buffers.
3061 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3062 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3063 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3064 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3065 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3066 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// out = (texel * factor) >> 8 per channel. The result is stored into an unsigned char without clamping,
// so a factor above 256 can wrap.
3067 for (x = startx;x < endx;x++)
3069 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3070 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3071 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3072 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3074 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: projects positions with ModelViewProjectionMatrixM1, passes COLOR
// through unchanged and transforms TEXCOORD0 by the TexMatrixM1 uniform, for dpsoftrast.numvertices vertices.
3079 void DPSOFTRAST_VertexShader_VertexColor(void)
3081 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3082 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3083 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader: out = ((Color_Diffuse * vertexcolor) + Color_Ambient) * texel, per channel,
// computed in 16-bit fixed point with SSE2.
// Output goes straight to the framebuffer row unless alpha test or a non-opaque blend mode is active, in which case
// it goes to a scratch buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
// NOTE(review): several short lines (declarations of data, slope, mod2, pix2, braces, else and the loop-advance
// statements of the 4-pixel path) are not visible in this listing.
3086 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3089 unsigned char * RESTRICT pixelmask = span->pixelmask;
// Default destination: the framebuffer location at (span->x, span->y), 4 bytes per pixel.
3090 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3091 int x, startx = span->startx, endx = span->endx;
3092 __m128i Color_Ambientm, Color_Diffusem;
3094 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3095 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3096 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3097 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3098 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// The literal index 2 equals DPSOFTRAST_ARRAY_TEXCOORD0 in DPSOFTRAST_ARRAY_e.
3099 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// Alpha test or blending needs the finish step, so render into the scratch buffer instead of the framebuffer.
3100 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3101 pixel = buffer_FragColorbgra8;
// Ambient color scaled by 256 with components 0 and 2 swapped; lane 3 is then replaced by Alpha scaled by 255.
3102 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3103 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3104 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// Narrow to 16-bit lanes; both 64-bit halves hold the same four-lane constant.
3105 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Diffuse color scaled by 4096 with components 0 and 2 swapped; lane 3 is zeroed.
3106 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3107 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3108 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// Set up the interpolant (data) and its per-pixel step (slope) for the COLOR array, swap components 0 and 2,
// advance to startx and scale by 4096. With the 4096-scaled diffuse, the high half of the 16-bit product is
// diffuse * color * 256, the same scale as Color_Ambientm.
3109 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3110 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3111 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3112 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3113 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3114 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3115 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3117 __m128i color, mod, pix;
// Fast path: four pixels at once when at least four remain and all four pixelmask bytes equal 1.
3118 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3121 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3122 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// Each pixel's interpolant is multiplied by its own buffer_z entry before conversion to integers.
3123 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3124 data = _mm_add_ps(data, slope);
3125 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3126 data = _mm_add_ps(data, slope);
3127 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3128 data = _mm_add_ps(data, slope);
3129 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse*mod + ambient) times the texel; the texel sits in the high byte of each lane so mulhi divides by 256.
3130 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3131 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3132 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3133 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3134 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path: same arithmetic on one texel.
3140 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3141 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3142 mod = _mm_packs_epi32(mod, mod);
3143 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3144 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Only the scratch-buffer path needs the finish step.
3146 if(pixel == buffer_FragColorbgra8)
3147 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: projects positions with ModelViewProjectionMatrixM1, transforms TEXCOORD0
// by the TexMatrixM1 uniform and passes TEXCOORD4 through unchanged, for dpsoftrast.numvertices vertices.
3153 void DPSOFTRAST_VertexShader_Lightmap(void)
3155 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3156 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3157 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
// Pixel stage of the lightmap shader: out = ((Color_Diffuse * lightmap) + Color_Ambient) * texel, per channel,
// plus Color_Glow * glowtexel when the GLOW permutation bit is set. Computed in 16-bit fixed point with SSE2.
// Output goes straight to the framebuffer row unless alpha test or a non-opaque blend mode is active, in which case
// it goes to a scratch buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
// NOTE(review): several short lines (declaration of pix2, braces, else and the loop-advance statements of the
// 4-pixel paths) are not visible in this listing.
3160 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3163 unsigned char * RESTRICT pixelmask = span->pixelmask;
// Default destination: the framebuffer location at (span->x, span->y), 4 bytes per pixel.
3164 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3165 int x, startx = span->startx, endx = span->endx;
3166 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3167 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3168 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3169 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3171 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3172 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Color texture is sampled with TEXCOORD0, the lightmap with TEXCOORD4.
3173 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3174 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Alpha test or blending needs the finish step, so render into the scratch buffer instead of the framebuffer.
3175 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3176 pixel = buffer_FragColorbgra8;
// Ambient color scaled by 256 with components 0 and 2 swapped; lane 3 is then replaced by Alpha scaled by 255.
3177 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3178 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3179 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// Narrow to 16-bit lanes; both 64-bit halves hold the same four-lane constant.
3180 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Diffuse color scaled by 256 with components 0 and 2 swapped; lane 3 is zeroed.
3181 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3182 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3183 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// Glow variant: one extra texture fetch and one extra constant.
3184 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3186 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3187 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3188 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3189 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// Low 64 bits = ambient, high 64 bits = glow color; used by the single-pixel path below.
3190 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3191 for (x = startx;x < endx;x++)
3193 __m128i color, lightmap, glow, pix;
// Fast path: four pixels at once when at least four remain and all four pixelmask bytes equal 1.
3194 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3197 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3198 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3199 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// Texels are unpacked into the high byte of each 16-bit lane so mulhi_epu16 with a 256-scaled constant divides by 256.
3200 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3201 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3202 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3203 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3204 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3205 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3206 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path: the low half computes the lit texel, the high half computes glow color * glow texel
// (the lightmap's high half is zero, so the high half of the sum is just the glow color), then the halves are added.
3212 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3213 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3214 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3215 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3216 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3217 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Variant without glow (presumably the else branch of the GLOW test above).
3222 for (x = startx;x < endx;x++)
3224 __m128i color, lightmap, pix;
// Fast path: four pixels at once when at least four remain and all four pixelmask bytes equal 1.
3225 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3228 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3229 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3230 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3231 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3232 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3233 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3234 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path: same arithmetic on one texel.
3240 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3241 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3242 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3243 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Only the scratch-buffer path needs the finish step.
3246 if(pixel == buffer_FragColorbgra8)
3247 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only positions are transformed and projected (ModelViewProjectionMatrixM1).
3253 void DPSOFTRAST_VertexShader_FakeLight(void)
3255 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: no lighting is computed here; it emits an all-zero color span and finishes it.
3258 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3261 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3262 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3263 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero the first span->length pixels (4 bytes each) of the output buffer.
3264 memset(buffer_FragColorbgra8, 0, span->length*4);
3265 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the model-space light-direction-map shader: delegates entirely to the lightmap vertex shader.
3270 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3272 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the model-space light-direction-map shader: delegates entirely to the lightmap pixel shader
// with the same thread, triangle and span.
3275 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3277 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for the tangent-space light-direction-map shader: delegates entirely to the lightmap vertex shader.
3283 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3285 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the tangent-space light-direction-map shader: delegates entirely to the lightmap pixel shader
// with the same thread, triangle and span.
3288 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3290 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the light-direction shader. Besides projecting positions and transforming TEXCOORD0, it writes two
// per-vertex vectors expressed in the basis formed by input TEXCOORD1/TEXCOORD2/TEXCOORD3 (read here as svector,
// tvector, normal): the LightDir uniform into post TEXCOORD1, and (EyePosition - position) into post TEXCOORD2.
// NOTE(review): declarations of i, LightDir, position, svector, tvector, normal and EyeVector are not visible in this listing.
3296 void DPSOFTRAST_VertexShader_LightDirection(void)
3299 int numvertices = dpsoftrast.numvertices;
3301 float LightVector[4];
3302 float EyePosition[4];
3303 float EyeVectorModelSpace[4];
// Cache the two uniforms used by every vertex.
3309 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3310 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3311 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3312 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3313 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3314 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3315 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3316 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3317 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3318 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3319 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and its three basis vectors (only the first three components of each are read).
3321 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3322 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3323 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3324 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3325 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3326 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3327 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3328 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3329 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3330 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3331 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3332 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction projected onto (svector, tvector, normal) via three dot products; the fourth component is set to 0.
3333 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3334 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3335 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3336 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3337 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3338 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3339 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then projected onto the same basis; the fourth component is set to 0.
3340 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3341 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3342 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3343 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3344 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3345 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3346 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3347 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3348 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3349 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Small scalar and 3-component vector helper macros.
// Min/Max expand each argument more than once, so arguments with side effects must be avoided.
// Vector3Dot indexes elements [0], [1] and [2] of both operands; LengthSquared and Length are built on it.
// Vector3Normalize starts by computing len = sqrt(dot(v, v)); the rest of its body is not visible in this listing.
3353 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3354 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3355 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3356 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3357 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3358 #define DPSOFTRAST_Vector3Normalize(v)\
3361 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection mode; shades one span of pixels.
// thread   - supplies the uniforms (thread->uniform4f) and the permutation flags (thread->shader_permutation)
// triangle - triangle the span belongs to; passed through to the attribute/texture helpers
// span     - span->startx and span->endx bound the x range that is shaded
// Flow: load colors from uniforms, sample the needed textures into per-span BGRA8 buffers, evaluate one of three
// lighting formulas per pixel (specular+diffuse, diffuse only, ambient only), then hand buffer_FragColorbgra8 to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Color arrays are kept in B,G,R,A order so that index n lines up with byte n of the BGRA8 texture buffers.
3372 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, indexed by x (one float, or 4 bytes, per pixel)
3374 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3375 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3376 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3377 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382 int x, startx = span->startx, endx = span->endx;
3383 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation base and per-x slope for the light vector (TEXCOORD1) and eye vector (TEXCOORD2)
3384 float LightVectordata[4];
3385 float LightVectorslope[4];
3386 float EyeVectordata[4];
3387 float EyeVectorslope[4];
3389 float diffusetex[4];
3391 float surfacenormal[4];
3392 float lightnormal[4];
3394 float specularnormal[4];
3397 float SpecularPower;
// uniform components +0,+1,+2 are stored into indices 2,1,0, i.e. swapped into BGR order
3399 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3400 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3401 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3402 Color_Glow[3] = 0.0f;
// the 4th ambient component carries the Alpha uniform; it scales the output alpha (d[3]) in every path below
3403 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3404 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3405 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3406 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3407 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3408 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3409 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3410 Color_Pants[3] = 0.0f;
3411 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3412 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3413 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3414 Color_Shirt[3] = 0.0f;
// start the span (presumably fills buffer_z - confirm in DPSOFTRAST_Draw_Span_Begin), then sample the base color texture with TEXCOORD0
3415 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3416 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// pants/shirt and glow textures are sampled only when their permutation bit is set; their buffers are uninitialized otherwise
3417 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3419 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3420 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3422 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3424 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: specular + diffuse + ambient ----
3426 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3428 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3429 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3430 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3431 Color_Diffuse[3] = 0.0f;
3432 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3433 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3434 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3435 LightColor[3] = 0.0f;
3436 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3437 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3438 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3439 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3440 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3441 Color_Specular[3] = 0.0f;
// pre-divided by 255 because the exponent below multiplies it by the gloss alpha byte (0..255)
3442 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3443 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3444 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3445 for (x = startx;x < endx;x++)
// texel values stay in the 0..255 byte range; no normalization to 0..1 is applied
3448 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3449 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3450 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3451 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping: add the pants and shirt texels tinted by their colors (the alpha terms are zero because Color_Pants[3] and Color_Shirt[3] are 0)
3452 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3454 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3455 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3456 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3457 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3459 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3460 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3461 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3462 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 maps 0..255 to roughly -1..+1; indices 2,1,0 undo the BGR byte order
3463 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3464 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3465 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3466 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolate the light vector at x (base + slope*x) and scale by z, then normalize
// NOTE(review): z is presumably the per-pixel value taken from buffer_z for perspective correction - confirm
3468 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3469 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3470 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3471 DPSOFTRAST_Vector3Normalize(lightnormal);
// same interpolation for the eye vector
3473 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3474 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3475 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3476 DPSOFTRAST_Vector3Normalize(eyenormal);
// half-way direction between light and eye: normalized sum of the two unit vectors
3478 specularnormal[0] = lightnormal[0] + eyenormal[0];
3479 specularnormal[1] = lightnormal[1] + eyenormal[1];
3480 specularnormal[2] = lightnormal[2] + eyenormal[2];
3481 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped at zero so surfaces facing away get no contribution
3483 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3484 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// NOTE(review): when the gloss alpha is 0 the exponent is 0 and pow() returns 1 (full-strength specular term) - confirm this is intended
3485 specular = pow(specular, SpecularPower * glosstex[3]);
// final color = [glow +] ambient + (diffuse + specular) * light color; each channel is clamped to 255 from above only
3486 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3488 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3489 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3490 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3491 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3495 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3496 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3497 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3498 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the pixel as 4 bytes in B,G,R,A order
3500 buffer_FragColorbgra8[x*4+0] = d[0];
3501 buffer_FragColorbgra8[x*4+1] = d[1];
3502 buffer_FragColorbgra8[x*4+2] = d[2];
3503 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse + ambient (no specular, so no eye vector or gloss texture is needed) ----
3506 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3508 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3509 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3510 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3511 Color_Diffuse[3] = 0.0f;
3512 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3513 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3514 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3515 LightColor[3] = 0.0f;
3516 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3517 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3518 for (x = startx;x < endx;x++)
// NOTE(review): unlike path 1, the pants/shirt texels are not added to diffusetex here even when
// SHADERPERMUTATION_COLORMAPPING is set (the textures are still sampled above) - confirm whether this is intentional
3521 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3522 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3523 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3524 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode and normalize the normal map texel (same mapping as in path 1)
3525 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3526 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3527 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3528 DPSOFTRAST_Vector3Normalize(surfacenormal);
3530 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3531 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3532 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3533 DPSOFTRAST_Vector3Normalize(lightnormal);
3535 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// final color = [glow +] texel * (ambient + diffuse * light color)
3536 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3538 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3539 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3540 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3541 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3545 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3546 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3547 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3548 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3550 buffer_FragColorbgra8[x*4+0] = d[0];
3551 buffer_FragColorbgra8[x*4+1] = d[1];
3552 buffer_FragColorbgra8[x*4+2] = d[2];
3553 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (texel * ambient color, plus optional glow) ----
3558 for (x = startx;x < endx;x++)
// NOTE(review): pants/shirt colormapping is not applied in this path either - confirm whether this is intentional
3561 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3562 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3563 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3564 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3566 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3568 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3569 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3570 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3571 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3575 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3576 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3577 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3578 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3580 buffer_FragColorbgra8[x*4+0] = d[0];
3581 buffer_FragColorbgra8[x*4+1] = d[1];
3582 buffer_FragColorbgra8[x*4+2] = d[2];
3583 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
3586 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode. Works on the global dpsoftrast state (no parameters, no return value).
// Outputs written to dpsoftrast.post_array4f:
//   POSITION  - input positions transformed/projected with ModelViewProjectionMatrixM1 (also fills dpsoftrast.screencoord4f)
//   TEXCOORD0 - input TEXCOORD0 transformed by TexMatrixM1
//   TEXCOORD1 - per-vertex vector towards the light, expressed in the (svector, tvector, normal) basis, w = 0
//   TEXCOORD2 - per-vertex vector towards the eye, expressed in the same basis, w = 0
//   TEXCOORD3 - input positions transformed by ModelToLightM1
//   TEXCOORD4 - copied unchanged from the input array
3591 void DPSOFTRAST_VertexShader_LightSource(void)
3594 int numvertices = dpsoftrast.numvertices;
3595 float LightPosition[4];
3596 float LightVector[4];
3597 float LightVectorModelSpace[4];
3598 float EyePosition[4];
3599 float EyeVectorModelSpace[4];
// NOTE(review): the 4th component of LightPosition and EyePosition is loaded but not used by the per-vertex math below
3605 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3606 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3607 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3608 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3609 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3610 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3611 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3612 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// whole-array passes: positions, texture coordinates, light-space positions, and a straight copy of TEXCOORD4
3613 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3614 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3615 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3616 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3617 for (i = 0;i < numvertices;i++)
// gather this vertex's position and its basis vectors: input TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal
3619 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3620 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3621 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3622 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3623 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3624 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3625 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3626 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3627 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3628 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3629 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3630 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light, then its three dot products with svector/tvector/normal
3631 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3632 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3633 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3634 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3635 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3636 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// the vector is not normalized here; the pixel shaders normalize after interpolation
3637 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3638 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3639 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3640 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// same computation for the vector from the vertex to the eye
3641 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3642 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3643 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3644 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3645 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3646 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3647 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3648 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3649 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3650 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Pixel shader for the LightSource mode; shades one span of pixels.
// thread   - supplies the uniforms (thread->uniform4f) and the permutation flags (thread->shader_permutation)
// triangle - triangle the span belongs to; passed through to the attribute/texture helpers
// span     - span->startx and span->endx bound the x range that is shaded
// Flow: load colors from uniforms, clear the output span, sample the needed textures, then per pixel compute a
// distance attenuation (optionally multiplied by a shadowmap sample) and one of three lighting formulas
// (specular+diffuse+ambient, diffuse+ambient, ambient only), optionally filtered by a cube texture.
// Color arrays are kept in B,G,R,A order so that index n lines up with byte n of the BGRA8 texture buffers.
// NOTE(review): Color_Glow and Color_Ambient[3] (the Alpha uniform) are loaded but never read in this function.
3654 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, indexed by x (one float, or 4 bytes, per pixel)
3656 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3657 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3658 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3659 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664 int x, startx = span->startx, endx = span->endx;
3665 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation base and per-x slope for TEXCOORD3 (cube vector), TEXCOORD1 (light vector) and TEXCOORD2 (eye vector)
3666 float CubeVectordata[4];
3667 float CubeVectorslope[4];
3668 float LightVectordata[4];
3669 float LightVectorslope[4];
3670 float EyeVectordata[4];
3671 float EyeVectorslope[4];
3673 float diffusetex[4];
3675 float surfacenormal[4];
3676 float lightnormal[4];
3678 float specularnormal[4];
3681 float SpecularPower;
3682 float CubeVector[4];
// uniform components +0,+1,+2 are stored into indices 2,1,0, i.e. swapped into BGR order
3685 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3686 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3687 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3688 Color_Glow[3] = 0.0f;
3689 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3690 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3691 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3692 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3693 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3694 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3695 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3696 Color_Diffuse[3] = 0.0f;
3697 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3698 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3699 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3700 Color_Specular[3] = 0.0f;
3701 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3702 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3703 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3704 Color_Pants[3] = 0.0f;
3705 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3706 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3707 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3708 Color_Shirt[3] = 0.0f;
3709 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3710 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3711 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3712 LightColor[3] = 0.0f;
// pre-divided by 255 because the exponent below multiplies it by the gloss alpha byte (0..255)
3713 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3714 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3715 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3716 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3717 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3718 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3719 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional textures are sampled only when their permutation bit is set; their buffers are uninitialized otherwise
3720 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3722 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3725 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3726 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: specular + diffuse + ambient ----
3727 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3729 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3730 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3731 for (x = startx;x < endx;x++)
// interpolate the light-space position at x (base + slope*x) and scale by z
// NOTE(review): z is presumably the per-pixel value taken from buffer_z for perspective correction - confirm
3734 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3735 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3736 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// attenuation falls off with squared distance: 1 at the light origin, 0 at distance 1 in light space
3737 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// pixels below the 0.01 threshold are treated as black; per the comment at the memset above they are skipped and keep the cleared value
3738 if (attenuation < 0.01f)
3740 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3742 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3743 if (attenuation < 0.01f)
// texel values stay in the 0..255 byte range; no normalization to 0..1 is applied
3747 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3748 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3749 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3750 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping: add the pants and shirt texels tinted by their colors (the alpha terms are zero because Color_Pants[3] and Color_Shirt[3] are 0)
3751 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3753 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3754 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3755 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3756 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3758 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3759 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3760 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3761 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 maps 0..255 to roughly -1..+1; indices 2,1,0 undo the BGR byte order
3762 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3763 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3764 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3765 DPSOFTRAST_Vector3Normalize(surfacenormal);
3767 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3768 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3769 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3770 DPSOFTRAST_Vector3Normalize(lightnormal);
3772 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3773 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3774 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3775 DPSOFTRAST_Vector3Normalize(eyenormal);
// half-way direction between light and eye: normalized sum of the two unit vectors
3777 specularnormal[0] = lightnormal[0] + eyenormal[0];
3778 specularnormal[1] = lightnormal[1] + eyenormal[1];
3779 specularnormal[2] = lightnormal[2] + eyenormal[2];
3780 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped at zero so surfaces facing away get no contribution
3782 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3783 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// NOTE(review): when the gloss alpha is 0 the exponent is 0 and pow() returns 1 (full-strength specular term) - confirm this is intended
3784 specular = pow(specular, SpecularPower * glosstex[3]);
// final color = (texel * (ambient + diffuse) + gloss * specular) * light color [* cube texel] * attenuation; clamped to 255 from above only
// output alpha is the texel alpha as-is
3785 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3787 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3788 attenuation *= (1.0f / 255.0f);
3789 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3790 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3791 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3792 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube texture factor
3796 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3797 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3798 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3799 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// store the pixel as 4 bytes in B,G,R,A order
3801 buffer_FragColorbgra8[x*4+0] = d[0];
3802 buffer_FragColorbgra8[x*4+1] = d[1];
3803 buffer_FragColorbgra8[x*4+2] = d[2];
3804 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse + ambient (no eye vector or gloss texture needed) ----
3807 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3809 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3810 for (x = startx;x < endx;x++)
// attenuation and shadowmap test, same as in path 1
3813 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3814 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3815 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3816 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3817 if (attenuation < 0.01f)
3819 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3821 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3822 if (attenuation < 0.01f)
3826 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3827 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3828 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3829 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3830 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3832 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3833 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3834 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3835 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3837 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3838 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3839 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3840 DPSOFTRAST_Vector3Normalize(surfacenormal);
3842 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3843 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3844 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3845 DPSOFTRAST_Vector3Normalize(lightnormal);
3847 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// final color = texel * (ambient + diffuse) * light color [* cube texel] * attenuation
3848 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3850 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3851 attenuation *= (1.0f / 255.0f);
3852 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3853 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3854 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3855 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3859 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3860 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3861 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3862 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3864 buffer_FragColorbgra8[x*4+0] = d[0];
3865 buffer_FragColorbgra8[x*4+1] = d[1];
3866 buffer_FragColorbgra8[x*4+2] = d[2];
3867 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (no normal map lookup; lighting uses Color_Ambient alone) ----
3872 for (x = startx;x < endx;x++)
3875 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3876 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3877 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3878 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3879 if (attenuation < 0.01f)
3881 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3883 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3884 if (attenuation < 0.01f)
3888 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3889 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3890 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3891 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3892 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3894 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3895 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3896 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3897 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// final color = texel * ambient * light color [* cube texel] * attenuation
3899 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3901 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3902 attenuation *= (1.0f / 255.0f);
3903 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3904 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3905 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3906 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3910 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3911 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3912 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3913 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3915 buffer_FragColorbgra8[x*4+0] = d[0];
3916 buffer_FragColorbgra8[x*4+1] = d[1];
3917 buffer_FragColorbgra8[x*4+2] = d[2];
3918 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
3921 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Refraction shader mode.
// Only positions are handled: the input position array, the vertex count and the
// ModelViewProjection uniform matrix are handed to DPSOFTRAST_Array_TransformProject together with
// post_array4f[POSITION] and screencoord4f (presumably the two outputs - confirm against the helper).
3926 void DPSOFTRAST_VertexShader_Refraction(void)
3928 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the Refraction shader mode.
// The visible code computes no refraction effect: it zero-fills the span's BGRA8 fragment
// colors and passes them to the common BGRA8 finish step.
// thread/triangle/span: forwarded unchanged to the span begin/finish helpers.
3931 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffer handed to DPSOFTRAST_Draw_Span_Begin; it is not read again in the visible code
3934 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3935 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3936 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all channels (including alpha) set to 0
3937 memset(buffer_FragColorbgra8, 0, span->length*4);
3938 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Water shader mode.
// Position-only: calls DPSOFTRAST_Array_TransformProject with the input position array, the vertex
// count and the ModelViewProjection uniform matrix; post_array4f[POSITION] and screencoord4f are
// passed as the first two arguments (presumably the destinations - confirm against the helper).
3943 void DPSOFTRAST_VertexShader_Water(void)
3945 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the Water shader mode.
// Placeholder behavior in the visible code: every pixel of the span gets a fragment color of
// zero, which is then handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread/triangle/span: forwarded unchanged to the span begin/finish helpers.
3949 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffer handed to DPSOFTRAST_Draw_Span_Begin; unused afterwards in the visible code
3952 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3953 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3954 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear span->length pixels of 4 bytes each
3955 memset(buffer_FragColorbgra8, 0, span->length*4);
3956 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Runs DPSOFTRAST_Array_TransformProject on the input positions with the ModelViewProjection
// uniform matrix; no other vertex arrays are touched here.
// NOTE(review): post_array4f[POSITION] and screencoord4f are presumably the outputs - confirm.
3961 void DPSOFTRAST_VertexShader_ShowDepth(void)
3963 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the ShowDepth shader mode.
// Despite the name, the visible code does not visualize depth: it writes an all-zero BGRA8
// color for each pixel of the span and passes the buffer to the finish step.
// thread/triangle/span: forwarded unchanged to the span begin/finish helpers.
3966 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is given to DPSOFTRAST_Draw_Span_Begin but its contents are not used to build the color
3969 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3970 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3971 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero all four channels for span->length pixels
3972 memset(buffer_FragColorbgra8, 0, span->length*4);
3973 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Position-only: the input position array is passed, with the ModelViewProjection uniform
// matrix and the vertex count, to DPSOFTRAST_Array_TransformProject.
// NOTE(review): post_array4f[POSITION] and screencoord4f are presumably the outputs - confirm.
3978 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
3980 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredGeometry shader mode.
// The visible code produces no geometry-buffer data: the span's fragment colors are cleared to
// zero and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread/triangle/span: forwarded unchanged to the span begin/finish helpers.
3983 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffer for DPSOFTRAST_Draw_Span_Begin; not consulted afterwards in the visible code
3986 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3987 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3988 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes (B, G, R, A) per pixel, all cleared
3989 memset(buffer_FragColorbgra8, 0, span->length*4);
3990 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject for the position array only, using the
// ModelViewProjection uniform matrix and dpsoftrast.numvertices.
// NOTE(review): post_array4f[POSITION] and screencoord4f are presumably the outputs - confirm.
3995 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
3997 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredLightSource shader mode.
// No lighting is evaluated in the visible code: the fragment color buffer is zero-filled for
// the span and passed to the BGRA8 finish step.
// thread/triangle/span: forwarded unchanged to the span begin/finish helpers.
4000 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffer for DPSOFTRAST_Draw_Span_Begin; its contents do not influence the color written below
4003 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4004 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4005 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear span->length BGRA8 pixels
4006 memset(buffer_FragColorbgra8, 0, span->length*4);
4007 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and span entry points plus the vertex arrays and
// texture units it consumes.
// NOTE(review): at least one member (lodarrayindex, read elsewhere in this file) is declared on a
// line that is not visible in this excerpt.
4012 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; takes no arguments and works on the global dpsoftrast state
4015 void (*Vertex)(void);
// span (pixel) stage, invoked once per span
4016 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices used by the mode; readers test each entry against DPSOFTRAST_ARRAY_TOTAL
4017 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of texture unit indices used by the mode; readers test each entry against DPSOFTRAST_MAXTEXTUREUNITS
4018 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4020 DPSOFTRAST_ShaderModeInfo;
// Per-shader-mode dispatch table, indexed by shader_mode.
// Each row is {lodarrayindex, Vertex, Span, arrays, texunits}:
//  - lodarrayindex is 2 in every row, which equals DPSOFTRAST_ARRAY_TEXCOORD0
//  - arrays and texunits are lists ended by ~0 (255 once stored in an unsigned char), a value
//    that fails the ">= DPSOFTRAST_ARRAY_TOTAL" / ">= DPSOFTRAST_MAXTEXTUREUNITS" range checks
//    applied by the code that walks these lists
// Both lists are terminated explicitly in every row. An omitted texunits initializer would be
// zero-filled, i.e. read as "texture unit 0" in every slot instead of "no texture units".
// NOTE(review): row order presumably has to match the SHADERMODE_* enumeration - confirm.
4022 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4024 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4025 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4026 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4027 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4028 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4029 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4030 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4031 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4032 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4033 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4034 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4035 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}, {~0}},
4036 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}, {~0}},
4037 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4038 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4039 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}}
// Replays queued state commands for one thread.
// thread: the worker whose state the DPSOFTRAST_Interpret_* handlers update.
// commandoffset: byte offset into dpsoftrast.commandpool.commands at which to start.
// endoffset: byte offset at which to stop; the loop runs until commandoffset equals it exactly.
// Returns the command offset reached.
// NOTE(review): parts of the switch (the end of the INTERPCOMMAND macro and the body of the Reset
// case) are on lines not visible in this excerpt.
4043 int DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int commandoffset, int endoffset)
4045 while (commandoffset != endoffset)
4047 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4048 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for one opcode: call the matching handler, then advance
// by sizeof(command) rounded up to a multiple of COMMAND_SIZE (the mask arithmetic assumes
// COMMAND_SIZE is a power of two), wrapping to offset 0 once the end of the pool is reached.
4050 #define INTERPCOMMAND(name) \
4051 case DPSOFTRAST_OPCODE_##name : \
4052 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4053 commandoffset += sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)); \
4054 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4055 commandoffset = 0; \
4057 INTERPCOMMAND(Viewport)
4058 INTERPCOMMAND(ClearColor)
4059 INTERPCOMMAND(ClearDepth)
4060 INTERPCOMMAND(ColorMask)
4061 INTERPCOMMAND(DepthTest)
4062 INTERPCOMMAND(ScissorTest)
4063 INTERPCOMMAND(Scissor)
4064 INTERPCOMMAND(BlendFunc)
4065 INTERPCOMMAND(BlendSubtract)
4066 INTERPCOMMAND(DepthMask)
4067 INTERPCOMMAND(DepthFunc)
4068 INTERPCOMMAND(DepthRange)
4069 INTERPCOMMAND(PolygonOffset)
4070 INTERPCOMMAND(AlphaTest)
4071 INTERPCOMMAND(AlphaFunc)
4072 INTERPCOMMAND(SetTexture)
4073 INTERPCOMMAND(SetShader)
4074 INTERPCOMMAND(Uniform4f)
4075 INTERPCOMMAND(UniformMatrix4f)
4076 INTERPCOMMAND(Uniform1i)
// Reset is opcode 0 (see the DPSOFTRAST_OPCODE_Reset enum); its handling is not visible here
4078 case DPSOFTRAST_OPCODE_Reset:
4083 return commandoffset;
// Shades every span queued in thread->spans, then empties the queue.
// thread: worker state holding the span queue and the current render state.
// commandoffset: command-pool offset up to which this thread's state has already been interpreted.
// Returns the command offset after any additional commands were interpreted.
// NOTE(review): several declarations and brace/else lines of this function are not visible in
// this excerpt; comments below describe only the visible statements.
4086 int DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread, int commandoffset)
4093 // unsigned int *colorpixel;
4094 unsigned int *depthpixel;
4100 DPSOFTRAST_State_Triangle *triangle;
4101 DPSOFTRAST_State_Span *span;
// one pass/fail flag per pixel of the current span; shared by all spans in this call
4102 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4103 for (i = 0; i < thread->numspans; i++)
4105 span = &thread->spans[i];
4106 triangle = &dpsoftrast.trianglepool.triangles[span->triangle];
// bring the thread's render state up to the commands recorded when this triangle was queued
4107 if (commandoffset != triangle->commandoffset)
4109 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4110 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4112 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds a plane equation: w[0] is the per-x slope, w[1] the per-y slope, w[2] the value at the origin
4114 wslope = triangle->w[0];
4115 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// scaled integer depth at the first pixel and its per-pixel step; the polygon offset terms are subtracted from the start value
4116 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4117 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4118 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// pixelmask[x] records whether pixel x passes the depth comparison selected by fb_depthfunc
// NOTE(review): no GL_NOTEQUAL case is visible; confirm how that value is handled
4119 switch(thread->fb_depthfunc)
4122 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4123 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4124 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4125 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4126 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4127 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4128 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4130 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4131 //for (x = 0;x < span->length;x++)
4132 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4133 // if there is no color buffer, skip pixel shader
// shrink [startx, endx) past failed pixels at both ends of the span
4135 endx = span->length;
4136 while (startx < endx && !pixelmask[startx])
4138 while (endx > startx && !pixelmask[endx-1])
4141 continue; // no pixels to fill
4142 span->pixelmask = pixelmask;
4143 span->startx = startx;
4145 // run pixel shader if appropriate
4146 // do this before running depthmask code, to allow the pixelshader
4147 // to clear pixelmask values for alpha testing
4148 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4149 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth write pass over the trimmed range (the loop body is not visible in this excerpt)
4150 if (thread->depthmask)
4151 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4157 // no depth testing means we're just dealing with color...
4158 // if there is no color buffer, skip pixel shader
4159 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span is marked as passing
4161 memset(pixelmask, 1, span->length);
4162 span->pixelmask = pixelmask;
4164 span->endx = span->length;
4165 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the queue is now empty and can be refilled by the caller
4169 thread->numspans = 0;
4170 return commandoffset;
// Walks the pending triangles in the triangle pool and converts them into spans for one thread,
// shading them through DPSOFTRAST_Draw_ProcessSpans.
// thread: the worker; supplies its index, its span queue and its saved command/triangle offsets.
// freetriangle: pool index at which the walk stops (the first triangle that must NOT be processed).
// On exit, thread->commandoffset and thread->triangleoffset record the progress made.
// NOTE(review): many declarations, braces and the switch headers are on lines not visible in
// this excerpt; comments below describe only the visible statements.
4173 void DPSOFTRAST_Draw_GenerateSpans(DPSOFTRAST_State_Thread *thread, int freetriangle)
// this thread's band of rows: the framebuffer height is divided evenly among numthreads by thread index
4175 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4176 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4177 int commandoffset = thread->commandoffset;
4178 int triangleoffset = thread->triangleoffset;
4179 DPSOFTRAST_State_Triangle *triangle = NULL;
4186 while (triangleoffset != freetriangle)
4188 triangle = &dpsoftrast.trianglepool.triangles[triangleoffset];
// advance the pool index; the action taken at the end of the pool is on a hidden line (presumably wrap to 0)
4189 if (++triangleoffset >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL)
4191 starty = triangle->starty + 1;
4192 endy = triangle->endy;
// triangle lies entirely outside this thread's band of rows
4193 if (starty >= maxy || endy <= miny)
4195 numpoints = triangle->numpoints;
4196 coords[0] = _mm_load_ps(triangle->coords[0]);
4197 coords[1] = _mm_load_ps(triangle->coords[1]);
4198 coords[2] = _mm_load_ps(triangle->coords[2]);
4199 coords[3] = _mm_load_ps(triangle->coords[3]);
4200 ycoords = _mm_load_si128((const __m128i *)triangle->ycoords);
4205 for (y = starty; y < endy;)
4207 __m128 xcoords, xslope;
// compare the current row against each vertex's integer y; movemask yields one 0xF nibble per vertex with y > its ycoord
4208 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), ycoords);
4209 int yccmask = _mm_movemask_epi8(ycc);
// edge0 / edge1 are the two polygon edges crossing this row, each given as a (p, n) pair of vertex indices
4210 int edge0p, edge0n, edge1p, edge1n;
// first table references vertex index 3, so it presumably serves 4-point polygons - confirm against the hidden switch header
4217 case 0xFFFF: /*0000*/ y = endy; continue;
4218 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4219 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4220 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4221 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4222 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4223 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4224 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4225 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4226 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4227 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4228 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4229 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4230 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4231 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4232 case 0x0000: /*1111*/ y++; continue;
// second table uses vertex indices 0..2 only, presumably the 3-point case
4240 case 0xFFFF: /*000*/ y = endy; continue;
4241 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4242 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4243 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4244 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4245 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4246 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4247 case 0x0000: /*111*/ y++; continue;
// nexty = smallest vertex y among vertices the row has not passed: lanes where y > ycoord are replaced by 0x7FFF before the horizontal min
4250 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), ycoords);
4251 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4252 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4253 nexty = _mm_extract_epi16(ycc, 0);
4254 if(nexty >= endy) nexty = endy-1;
// edge0's largest x exceeds edge1's smallest x; the reaction is on hidden lines (presumably the edges get swapped)
4255 if (_mm_ucomigt_ss(_mm_max_ss(coords[edge0n], coords[edge0p]), _mm_min_ss(coords[edge1n], coords[edge1p])))
// per-edge dx/dy, then the x position of both edges at row y; lanes 0-1 belong to edge0, lanes 2-3 to edge1
4264 xslope = _mm_sub_ps(_mm_movelh_ps(coords[edge0n], coords[edge1n]), _mm_movelh_ps(coords[edge0p], coords[edge1p]));
4265 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4266 xcoords = _mm_add_ps(_mm_movelh_ps(coords[edge0p], coords[edge1p]),
4267 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(coords[edge0p], coords[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
// half-pixel bias applied before the float-to-int conversions below
4268 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4269 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4271 int startx, endx, offset;
4272 startx = _mm_cvtss_si32(xcoords);
4273 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row segment to the framebuffer width and drop empty segments
4274 if (startx < 0) startx = 0;
4275 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4276 if (startx >= endx) continue;
// split the row segment into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4277 for (offset = startx; offset < endx;)
4279 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4280 span->triangle = (int)(triangle - dpsoftrast.trianglepool.triangles);
4283 span->length = endx - offset;
4284 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4285 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4286 offset += span->length;
// span queue full: shade what has been collected so far (ProcessSpans resets numspans to 0)
4287 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4288 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
// shade whatever is still queued
4294 if (thread->numspans > 0)
4295 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
// catch the thread's state up to the last triangle seen
// NOTE(review): triangle is initialized to NULL; confirm this is never reached when the loop above ran zero times
4296 if (commandoffset != triangle->commandoffset)
4298 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4299 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4304 thread->commandoffset = commandoffset;
4305 thread->triangleoffset = triangleoffset;
// Makes sure every thread has consumed all queued triangles, then marks the triangle and
// command pools as empty.
// NOTE(review): conditional-compilation and brace lines are not visible in this excerpt; the
// mutex/condition path and the direct DPSOFTRAST_Draw_GenerateSpans call below are presumably
// alternatives selected by a threading #ifdef - confirm.
4308 void DPSOFTRAST_Draw_FlushThreads(void)
4310 DPSOFTRAST_State_Thread *thread;
// publish any triangles queued since the last update of drawtriangle
4312 if(dpsoftrast.drawtriangle != dpsoftrast.trianglepool.freetriangle)
4315 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4318 SDL_LockMutex(dpsoftrast.trianglemutex);
4320 for (i = 0; i < dpsoftrast.numthreads; i++)
4322 thread = &dpsoftrast.threads[i];
// wait until this thread has caught up: flag ourselves as waiting, wake the workers, then sleep on the thread's own condition
4324 while (thread->triangleoffset != dpsoftrast.drawtriangle)
4326 thread->waiting = true;
4327 SDL_CondBroadcast(dpsoftrast.trianglecond);
4328 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
4329 thread->waiting = false;
// run the thread's work inline on the calling thread
4332 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4333 DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4337 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// everything has been consumed, so both pools count as empty again
4339 dpsoftrast.trianglepool.usedtriangles = 0;
4340 dpsoftrast.commandpool.usedcommands = 0;
// Entry point of a rasterizer worker thread.
// data: pointer to the DPSOFTRAST_State_Thread owned by this worker.
// Loops while thread->index is non-negative: processes pending triangles, then sleeps on
// dpsoftrast.trianglecond until more are published.
// NOTE(review): brace/continue lines and the return statement are not visible in this excerpt.
4344 static int DPSOFTRAST_Draw_Thread(void *data)
4346 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4347 while(thread->index >= 0)
// unlocked check: there are triangles this thread has not consumed yet
4349 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4351 DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4355 SDL_LockMutex(dpsoftrast.trianglemutex);
// re-check under the mutex: more triangles were published in the meantime, so release the lock instead of sleeping
4356 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4358 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// a flusher has set thread->waiting and sleeps on waitcond; wake it, then sleep until new work is broadcast
4361 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4362 SDL_CondWait(dpsoftrast.trianglecond, dpsoftrast.trianglemutex);
4363 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// Clips, sets up and queues triangles into the shared triangle pool for the rasterizer threads.
// firstvertex: subtracted from every element index to address the loaded vertex arrays.
// numtriangles: number of triangles (three indices each) to process.
// element3i / element3s: int and unsigned short index lists; which one is read is decided on lines not visible here.
// arraymask: per-array enable flags (its use is on lines not visible in this excerpt).
// numarrays: number of attribute arrays for which interpolation planes are generated.
// NOTE(review): many declarations, braces, preprocessor conditionals and macro tails are on
// lines not visible in this excerpt; comments describe only the visible statements.
4370 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask, int numarrays)
4373 int cullface = dpsoftrast.cullface;
4374 int width = dpsoftrast.fb_width;
4375 int height = dpsoftrast.fb_height;
// (width-1, height-1) repeated: the largest valid pixel coordinates, used to clamp the bounding box
4376 __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
4377 DPSOFTRAST_State_Triangle *triangle;
4389 __m128 triangleedge1, triangleedge2, trianglenormal;
4392 DPSOFTRAST_Texture *texture;
4393 screen[3] = _mm_setzero_ps();
4394 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4395 for (i = 0;i < numtriangles;i++)
4397 // generate the 3 edges of this triangle
4398 // generate spans for the triangle - switch based on left split or right split classification of triangle
4401 e[0] = element3i[i*3+0] - firstvertex;
4402 e[1] = element3i[i*3+1] - firstvertex;
4403 e[2] = element3i[i*3+2] - firstvertex;
4407 e[0] = element3s[i*3+0] - firstvertex;
4408 e[1] = element3s[i*3+1] - firstvertex;
4409 e[2] = element3s[i*3+2] - firstvertex;
// SKIPBACKFACE: builds the two screen-space edges meeting at vertex 1 and puts
// edge1.x*edge2.y - edge1.y*edge2.x in the low lane of trianglenormal; its sign is then tested
// against zero (the actions guarded by the two tests are on lines not visible here).
4418 #define SKIPBACKFACE \
4419 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4420 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4421 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4422 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4423 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4427 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4431 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4435 //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
4436 // _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
4437 //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
4438 //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
4439 //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
// CLIPPEDVERTEXLERP(k,p1,p2) stores in clipfrac[p1] the fraction along edge p1->p2 at which clipdist reaches 0
// and projects the interpolated position into screen[k]; CLIPPEDVERTEXCOPY(k,p1) reuses the already projected vertex.
// GENATTRIBCOPY / GENATTRIBLERP do the same for attribute array j, reusing clipfrac[p1].
4441 // macros for clipping vertices
4443 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4444 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4446 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
4447 screen[k] = DPSOFTRAST_Draw_ProjectVertex(_mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1]))); \
4449 #define CLIPPEDVERTEXCOPY(k,p1) \
4450 screen[k] = _mm_load_ps(&dpsoftrast.screencoord4f[e[p1]*4]);
4452 #define GENATTRIBCOPY(j, attrib, p1) \
4453 attrib = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]);
4454 #define GENATTRIBLERP(j, attrib, p1, p2) \
4456 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p2]*4]); \
4457 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4459 #define GENATTRIBS(j, attrib0, attrib1, attrib2) \
4463 case 0: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4464 case 1: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4465 case 2: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4466 case 3: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4467 case 4: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4468 case 5: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4469 case 6: GENATTRIBLERP(j, attrib0, 1, 2); GENATTRIBCOPY(j, attrib1, 2); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4472 // calculate distance from nearplane
4473 clipdist[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
4474 clipdist[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
4475 clipdist[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
// seven in-front/behind combinations follow; each emits a 3- or 4-point polygon in screen[]
4476 if (clipdist[0] >= 0.0f)
4478 if (clipdist[1] >= 0.0f)
4480 if (clipdist[2] >= 0.0f)
4482 // triangle is entirely in front of nearplane
4483 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4490 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4498 if (clipdist[2] >= 0.0f)
4500 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4507 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4514 else if (clipdist[1] >= 0.0f)
4516 if (clipdist[2] >= 0.0f)
4518 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4525 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4531 else if (clipdist[2] >= 0.0f)
4533 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4538 else continue; // triangle is entirely behind nearplane
// pack the truncated (x, y) of up to four points into 16-bit lanes and reduce to a bounding box clamped to the framebuffer
4541 // calculate integer y coords for triangle points
4542 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4543 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4544 screenmin = _mm_min_epi16(screeni, screenir),
4545 screenmax = _mm_max_epi16(screeni, screenir);
4546 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4547 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4548 screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
4549 screenmax = _mm_min_epi16(screenmax, fbmax);
4550 // skip offscreen triangles
4551 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// lane 1 holds the y extent; endy is exclusive
4553 starty = _mm_extract_epi16(screenmin, 1);
4554 endy = _mm_extract_epi16(screenmax, 1)+1;
// arithmetic shift keeps only the y half of each packed (x, y) pair, one 32-bit y per point
4555 screeny = _mm_srai_epi32(screeni, 16);
// triangle pool almost full: make room before writing the next entry
4558 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
4560 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
4562 DPSOFTRAST_Draw_FlushThreads();
// fill in the pool entry; commandoffset remembers which state commands precede this triangle
4565 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
4566 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
4567 triangle->starty = starty;
4568 triangle->endy = endy;
4569 triangle->numpoints = numpoints;
4570 _mm_store_ps(triangle->coords[0], screen[0]);
4571 _mm_store_ps(triangle->coords[1], screen[1]);
4572 _mm_store_ps(triangle->coords[2], screen[2]);
4573 _mm_store_ps(triangle->coords[3], numpoints > 3 ? screen[3] : screen[2]);
4574 _mm_store_si128((__m128i *)triangle->ycoords, screeny);
4576 // calculate attribute plans for triangle data...
4577 // okay, this triangle is going to produce spans, we'd better project
4578 // the interpolants now (this is what gives perspective texturing),
4579 // this consists of simply multiplying all arrays by the W coord
4580 // (which is basically 1/Z), which will be undone per-pixel
4581 // (multiplying by Z again) to get the perspective-correct array
4584 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
// edge vectors divided by the scalar in trianglenormal, then each component broadcast to all lanes
4585 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4586 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4587 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4588 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4589 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4590 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4591 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4592 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w itself: x slope, y slope and value at the screen origin, stored in triangle->w[0..2]
4593 attribedge1 = _mm_sub_ss(w0, w1);
4594 attribedge2 = _mm_sub_ss(w2, w1);
4595 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4596 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4597 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4598 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4599 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4600 _mm_store_ss(&triangle->w[0], attribxslope);
4601 _mm_store_ss(&triangle->w[1], attribyslope);
4602 _mm_store_ss(&triangle->w[2], attriborigin);
// the same plane setup for each attribute array, applied to attribute*w (4 components at once)
4603 for (j = 0;j < numarrays;j++)
4607 __m128 attrib0, attrib1, attrib2;
4608 GENATTRIBS(j, attrib0, attrib1, attrib2);
4609 attriborigin = _mm_mul_ps(attrib1, w1);
4610 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4611 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4612 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4613 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4614 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4615 _mm_stream_ps(triangle->attribs[j][0], attribxslope);
4616 _mm_stream_ps(triangle->attribs[j][1], attribyslope);
4617 _mm_stream_ps(triangle->attribs[j][2], attriborigin);
4622 // adjust texture LOD by texture density, in the simplest way possible...
4624 __m128 mipedgescale, mipedgetc, mipdensity, attrib0, attrib1, attrib2;
4625 memset(triangle->mip, 0, sizeof(triangle->mip));
// approximate reciprocal screen-space length of each of the two edges
4626 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4627 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4628 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
// texture-coordinate change along each edge, taken from the shader mode's lodarrayindex array, per unit of screen length
4629 k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
4630 GENATTRIBS(k, attrib0, attrib1, attrib2);
4631 mipedgetc = _mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1));
4632 mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
4633 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4635 int texunit = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].texunits[j];
// an out-of-range entry marks the end of the mode's texture unit list (the action is on a hidden line)
4636 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4638 texture = dpsoftrast.texbound[texunit];
4639 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by the two ints at mipmap[0][2..3] (presumably the base level's width and height - confirm), square and sum per edge, keep the smaller edge value
4641 mipdensity = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4642 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4643 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4644 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4645 // this will be multiplied in the texturing routine by the texture resolution
4646 y = _mm_cvtss_si32(mipdensity);
// y is a squared density, so 0.5*log2(y) is log2 of the density; the result is clamped to the last mip level
4649 y = (int)(log((float)y)*0.5f/M_LN2);
4650 if (y > texture->mipmaps - 1)
4651 y = texture->mipmaps - 1;
4652 triangle->mip[texunit] = y;
// commit the entry: advance the ring index (wrapping to 0) and count it as used
4658 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
4659 dpsoftrast.trianglepool.usedtriangles++;
// after enough triangles have been queued, publish them so the threads can start on them
4662 if (numqueued >= DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES)
4665 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4668 SDL_LockMutex(dpsoftrast.trianglemutex);
4669 SDL_CondBroadcast(dpsoftrast.trianglecond);
4670 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4672 DPSOFTRAST_Draw_FlushThreads();
// publish whatever remains queued after the loop
4680 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4683 SDL_LockMutex(dpsoftrast.trianglemutex);
4684 SDL_CondBroadcast(dpsoftrast.trianglecond);
4685 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4687 DPSOFTRAST_Draw_FlushThreads();
// Draws indexed triangles with the current shader mode: decides which vertex arrays are
// needed, loads the vertices, runs the mode's vertex stage and queues the triangles.
// firstvertex / numvertices: range of vertices to load.
// numtriangles: number of triangles in the element list.
// element3i / element3s: int and unsigned short index lists, forwarded to DPSOFTRAST_Draw_ProcessTriangles.
// NOTE(review): the switch header and the break/continue lines of the loop are not visible in this excerpt.
4693 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
// highest array index in use; lastarray+1 is passed on as the number of arrays to interpolate
4696 int lastarray = DPSOFTRAST_ARRAY_POSITION;
4697 unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4698 memset(arraymask, false, sizeof(arraymask));
// positions are always needed
4699 arraymask[DPSOFTRAST_ARRAY_POSITION] = true;
4700 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4702 int arrayindex = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// an out-of-range entry marks the end of the mode's array list (the action is on a hidden line)
4703 if (arrayindex >= DPSOFTRAST_ARRAY_TOTAL)
4707 case DPSOFTRAST_ARRAY_POSITION:
4708 case DPSOFTRAST_ARRAY_COLOR:
// texcoord arrays are checked for a bound source pointer (the action taken when it is NULL is on a hidden line)
4711 if (dpsoftrast.pointer_texcoordf[arrayindex-DPSOFTRAST_ARRAY_TEXCOORD0] == NULL)
4715 arraymask[arrayindex] = true;
4716 if (arrayindex > lastarray)
4717 lastarray = arrayindex;
4719 DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, arraymask[DPSOFTRAST_ARRAY_COLOR]);
4720 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4721 // DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.screencoord4f, dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
4722 DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask, lastarray+1);
// Calls DPSOFTRAST_Draw_SyncCommands() followed by DPSOFTRAST_Draw_FlushThreads().
// NOTE(review): presumably this returns only once all queued work has been carried out
// by the worker threads - confirm against the two helpers.
4725 void DPSOFTRAST_Flush(void)
4727 DPSOFTRAST_Draw_SyncCommands();
4728 DPSOFTRAST_Draw_FlushThreads();
// Reset the global dpsoftrast state and set up the rasterizer threads.
//
// width, height: stored as the framebuffer size and used as the initial viewport and scissor size.
// numthreads:    requested number of threads; clamped to the range 1..64 where it is used.
// colorpixels:   stored as color buffer 0.
// depthpixels:   stored as the depth buffer.
//
// Every thread slot gets the same default render state, is validated with
// DPSOFTRAST_Validate(thread, -1) and is then started on DPSOFTRAST_Draw_Thread.
4731 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero state; every field that is not assigned below stays zero
4741 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is declared in lines not visible here; presumably a union used as a byte-order probe - confirm
4742 dpsoftrast.bigendian = u.b[3];
4743 dpsoftrast.fb_width = width;
4744 dpsoftrast.fb_height = height;
4745 dpsoftrast.fb_depthpixels = depthpixels;
4746 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index 1 is assigned three times; indices 2 and 3 may have been intended -
// confirm against the declaration of fb_colorpixels. The memset above has already zero-filled the array.
4747 dpsoftrast.fb_colorpixels[1] = NULL;
4748 dpsoftrast.fb_colorpixels[1] = NULL;
4749 dpsoftrast.fb_colorpixels[1] = NULL;
// texture bookkeeping starts at 1 with no capacity (texture_max = 0);
// presumably slot 0 is reserved as "no texture" - TODO confirm
4750 dpsoftrast.texture_firstfree = 1;
4751 dpsoftrast.texture_end = 1;
4752 dpsoftrast.texture_max = 0;
// initial viewport: origin 0,0 and the full framebuffer size
4753 dpsoftrast.viewport[0] = 0;
4754 dpsoftrast.viewport[1] = 0;
4755 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4756 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
// initial color: all four components set to 1
4757 dpsoftrast.color[0] = 1;
4758 dpsoftrast.color[1] = 1;
4759 dpsoftrast.color[2] = 1;
4760 dpsoftrast.color[3] = 1;
4761 dpsoftrast.cullface = GL_BACK;
// NOTE(review): numthreads is assigned here and again a few lines below; presumably the two
// assignments sit in alternative conditional-compilation branches that are not visible here - confirm
4763 dpsoftrast.numthreads = bound(1, numthreads, 64);
4764 dpsoftrast.trianglemutex = SDL_CreateMutex();
4765 dpsoftrast.trianglecond = SDL_CreateCond();
4767 dpsoftrast.numthreads = 1;
// zero-initialized array with one DPSOFTRAST_State_Thread per thread
// NOTE(review): no check of the allocation result is visible before the loop below uses it - confirm
4769 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4770 for (i = 0; i < dpsoftrast.numthreads; i++)
4772 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// NOTE(review): no assignment to colormask[0] is visible here, so it keeps the zero from
// MM_CALLOC unless it is set elsewhere - confirm that this is intended
4774 thread->colormask[1] = 1;
4775 thread->colormask[2] = 1;
4776 thread->colormask[3] = 1;
// default blend function: GL_ONE, GL_ZERO
4777 thread->blendfunc[0] = GL_ONE;
4778 thread->blendfunc[1] = GL_ZERO;
// depth writes and depth testing are on by default, using GL_LEQUAL
4779 thread->depthmask = true;
4780 thread->depthtest = true;
4781 thread->depthfunc = GL_LEQUAL;
// scissor and alpha testing are off by default
4782 thread->scissortest = false;
4783 thread->alphatest = false;
4784 thread->alphafunc = GL_GREATER;
4785 thread->alphavalue = 0.5f;
// default scissor: origin 0,0 and the full framebuffer size
4786 thread->scissor[0] = 0;
4787 thread->scissor[1] = 0;
4788 thread->scissor[2] = dpsoftrast.fb_width;
4789 thread->scissor[3] = dpsoftrast.fb_height;
4790 thread->depthrange[0] = 0;
4791 thread->depthrange[1] = 1;
4792 thread->polygonoffset[0] = 0;
4793 thread->polygonoffset[1] = 0;
// per-thread work counters and offsets start at zero
4795 thread->numspans = 0;
4796 thread->triangleoffset = 0;
4797 thread->commandoffset = 0;
4798 thread->waiting = false;
4800 thread->waitcond = SDL_CreateCond();
// validate is set to -1 and DPSOFTRAST_Validate is called with -1 as well;
// presumably -1 acts as an "everything" mask - TODO confirm
4803 thread->validate = -1;
4804 DPSOFTRAST_Validate(thread, -1);
// start the thread; it runs DPSOFTRAST_Draw_Thread with this state block as its argument
4806 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stop the rasterizer threads, release all resources held in dpsoftrast and
// leave the global state zeroed.
4811 void DPSOFTRAST_Shutdown(void)
4815 if(dpsoftrast.numthreads > 0)
4817 DPSOFTRAST_State_Thread *thread;
// with trianglemutex held, visit every thread and then wake all waiters on trianglecond
// NOTE(review): the statement applied to each thread inside this loop is not visible here;
// presumably it marks the thread for termination - confirm
4818 SDL_LockMutex(dpsoftrast.trianglemutex);
4819 for (i = 0; i < dpsoftrast.numthreads; i++)
4821 thread = &dpsoftrast.threads[i];
4824 SDL_CondBroadcast(dpsoftrast.trianglecond);
4825 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// wait for each thread to finish, then destroy its condition variable
4826 for (i = 0; i < dpsoftrast.numthreads; i++)
4828 thread = &dpsoftrast.threads[i];
4829 SDL_WaitThread(thread->thread, NULL);
4830 SDL_DestroyCond(thread->waitcond);
// the shared mutex and condition variable are destroyed only after the waits above
4832 SDL_DestroyMutex(dpsoftrast.trianglemutex);
4833 SDL_DestroyCond(dpsoftrast.trianglecond);
// free the pixel storage of every texture slot below texture_end that has any
// NOTE(review): this loop reads dpsoftrast.texture[i] before the NULL test on dpsoftrast.texture
// that follows it; DPSOFTRAST_Init sets texture_end to 1 on a zeroed struct, so confirm that
// the texture array is always allocated before this function can run
4836 for (i = 0;i < dpsoftrast.texture_end;i++)
4837 if (dpsoftrast.texture[i].bytes)
4838 MM_FREE(dpsoftrast.texture[i].bytes);
// the texture array is released with free() while the thread array uses MM_FREE
4839 if (dpsoftrast.texture)
4840 free(dpsoftrast.texture);
4841 if (dpsoftrast.threads)
4842 MM_FREE(dpsoftrast.threads);
// clear the whole state so that no stale pointers remain
4843 memset(&dpsoftrast, 0, sizeof(dpsoftrast));