3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
14 #include <SDL_thread.h>
18 typedef qboolean bool;
// Alignment in bytes passed to _mm_malloc by the allocation wrappers; matches
// the 32-byte alignment that ATOMIC() requests.
22 #define ATOMIC_SIZE 32
// GCC-style attribute spelling: ALIGN() requests 16-byte alignment of var,
// ATOMIC() requests 32-byte alignment.
25 #define ALIGN(var) var __attribute__((__aligned__(16)))
26 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// MEMORY_BARRIER expands to the SSE store-fence intrinsic.
27 #define MEMORY_BARRIER (_mm_sfence())
28 //(__sync_synchronize())
29 #elif defined(_MSC_VER)
// MSVC spelling of the same alignment requests and barrier.
30 #define ALIGN(var) __declspec(align(16)) var
31 #define ATOMIC(var) __declspec(align(32)) var
32 #define MEMORY_BARRIER (_mm_sfence())
// Fallback: no alignment is requested and the barrier is a no-op expression.
35 #define ALIGN(var) var
36 #define ATOMIC(var) var
37 #define MEMORY_BARRIER ((void)0)
// When threads or SSE2 are not available the barrier is defined as a no-op.
40 #if !defined(USE_THREADS) || !defined(SSE2_PRESENT)
42 #define MEMORY_BARRIER ((void)0)
46 #include <emmintrin.h>
// Aligned allocation: MM_MALLOC returns size bytes aligned to ATOMIC_SIZE.
48 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-like helper on top of _mm_malloc: allocates nmemb*size bytes aligned to
// ATOMIC_SIZE and zero-fills them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow, unlike calloc.
50 static void *MM_CALLOC(size_t nmemb, size_t size)
52 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
53 if(ptr != NULL) memset(ptr, 0, nmemb*size);
// Memory obtained from _mm_malloc is released with _mm_free.
57 #define MM_FREE _mm_free
// Alternative definitions that map straight onto the C allocator.
59 #define MM_MALLOC(size) malloc(size)
60 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL, being the last enumerator,
// is the number of arrays and is used to size per-array tables.
64 typedef enum DPSOFTRAST_ARRAY_e
66 DPSOFTRAST_ARRAY_POSITION,
67 DPSOFTRAST_ARRAY_COLOR,
68 DPSOFTRAST_ARRAY_TEXCOORD0,
69 DPSOFTRAST_ARRAY_TEXCOORD1,
70 DPSOFTRAST_ARRAY_TEXCOORD2,
71 DPSOFTRAST_ARRAY_TEXCOORD3,
72 DPSOFTRAST_ARRAY_TEXCOORD4,
73 DPSOFTRAST_ARRAY_TEXCOORD5,
74 DPSOFTRAST_ARRAY_TEXCOORD6,
75 DPSOFTRAST_ARRAY_TEXCOORD7,
76 DPSOFTRAST_ARRAY_TOTAL
// One texture slot of the dpsoftrast.texture array.
80 typedef struct DPSOFTRAST_Texture_s
// filter mode, assigned by DPSOFTRAST_Texture_Filter
87 DPSOFTRAST_TEXTURE_FILTER filter;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset of the level inside bytes, [1] byte size of the level,
// [2] width, [3] height, [4] depth
91 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands use the same size granularity and alignment wrapper as ALIGN().
95 #define COMMAND_SIZE ALIGN_SIZE
96 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header type; DPSOFTRAST_AllocateCommand casts pool memory to
// it and writes its opcode field.
98 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// Opcode 0: written by DPSOFTRAST_AllocateCommand at the tail of the pool when
// the next command does not fit in the remaining space.
104 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the aligned struct type DPSOFTRAST_Command_<name>.
106 #define DEFCOMMAND(opcodeval, name, fields) \
107 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
108 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
112 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer (2 MiB).
114 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// Byte buffer from which DPSOFTRAST_AllocateCommand carves command structs.
116 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
120 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
122 DPSOFTRAST_State_Command_Pool);
// One entry of the triangle pool.
124 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
127 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
132 ALIGN(float coords[4][4]);
133 ALIGN(int ycoords[4]);
// per attribute array three 4-float rows; the CALCATTRIB macros read row [0] as
// the x slope, row [1] as the y slope and row [2] as the base value
134 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
136 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at a span start:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are __m128 lvalues supplied by the caller.
138 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
139 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
140 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
141 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
142 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: slope and data are 4-element
// float arrays; each component is base + span->x * slope + span->y * yslope.
// NOTE(review): span is expanded as (span->x) rather than ((span)->x), so the
// argument must be a plain pointer expression; the SSE variant parenthesizes it.
144 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
145 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
146 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
147 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
148 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
149 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
150 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
151 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
152 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
155 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// A horizontal run of pixels belonging to one triangle; the thread state holds
// an array of DPSOFTRAST_DRAW_MAXSPANS of these.
157 typedef ALIGN(struct DPSOFTRAST_State_Span_s
159 int triangle; // triangle this span was generated by
160 int x; // framebuffer x coord
161 int y; // framebuffer y coord
162 int length; // pixel count
163 int startx; // usable range (according to pixelmask)
164 int endx; // usable range (according to pixelmask)
165 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
167 DPSOFTRAST_State_Span);
// Number of span entries in each thread's spans array.
169 #define DPSOFTRAST_DRAW_MAXSPANS 1024
// Number of triangle entries in the triangle pool; the pool is used as a ring
// (see the wrap-around in DPSOFTRAST_Draw_SyncCommands).
171 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
172 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
// Storage for the triangle ring.
174 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
178 ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
180 DPSOFTRAST_State_Triangle_Pool);
// Bit flags stored in thread->validate; a set bit means the matching derived
// values are stale and DPSOFTRAST_Validate must recompute them.
182 #define DPSOFTRAST_VALIDATE_FB 1
183 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
184 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three flags combined.
185 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the GL-style
// source/destination factor pair; DPSOFTRAST_BLENDMODE_TOTAL is the count.
187 typedef enum DPSOFTRAST_BLENDMODE_e
189 DPSOFTRAST_BLENDMODE_OPAQUE,
190 DPSOFTRAST_BLENDMODE_ALPHA,
191 DPSOFTRAST_BLENDMODE_ADDALPHA,
192 DPSOFTRAST_BLENDMODE_ADD,
193 DPSOFTRAST_BLENDMODE_INVMOD,
194 DPSOFTRAST_BLENDMODE_MUL,
195 DPSOFTRAST_BLENDMODE_MUL2,
196 DPSOFTRAST_BLENDMODE_SUBALPHA,
197 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
198 DPSOFTRAST_BLENDMODE_TOTAL
200 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* functions write the
// raw state fields; the DPSOFTRAST_Recalc* functions write the derived fb_* fields.
202 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] alongnormal, [1] intoview (see DPSOFTRAST_Interpret_PolygonOffset)
222 float polygonoffset[2];
225 int shader_permutation;
227 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// four floats per uniform
229 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
230 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
232 // DPSOFTRAST_VALIDATE_ flags
235 // derived values (DPSOFTRAST_VALIDATE_FB)
// x, y, width, height as stored by DPSOFTRAST_RecalcFB
237 int fb_clearscissor[4];
239 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
242 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against the pool's freecommand in
// DPSOFTRAST_Draw_FreeCommandPool
245 ATOMIC(int commandoffset);
254 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
256 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is the variable dpsoftrast.
258 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets, assigned by DPSOFTRAST_SetRenderTargets
262 unsigned int *fb_depthpixels;
263 unsigned int *fb_colorpixels[4];
// viewport transform terms, computed by DPSOFTRAST_Viewport
266 ALIGN(float fb_viewportcenter[4]);
267 ALIGN(float fb_viewportscale[4]);
// four floats per uniform
270 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
271 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array pointers with per-texcoord stride and component count
275 const float *pointer_vertex3f;
276 const float *pointer_color4f;
277 const unsigned char *pointer_color4ub;
278 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
281 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
282 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
283 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// per-attribute float buffers, indexed by DPSOFTRAST_ARRAY_*
287 float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
288 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
289 float *screencoord4f;
292 int shader_permutation;
// index at which DPSOFTRAST_Texture_New starts searching for a free slot
296 int texture_firstfree;
// growable texture slot array (resized with realloc in DPSOFTRAST_Texture_New)
297 DPSOFTRAST_Texture *texture;
// message describing the most recent error reported by an API function
302 const char *errorstring;
// array of numthreads per-thread states
305 DPSOFTRAST_State_Thread *threads;
// mutex/condition pair used by the pool-freeing functions
307 SDL_mutex *trianglemutex;
308 SDL_cond *trianglecond;
// set to the pool's freetriangle index by DPSOFTRAST_Draw_SyncCommands
311 ATOMIC(int drawtriangle);
313 DPSOFTRAST_State_Command_Pool commandpool;
314 DPSOFTRAST_State_Triangle_Pool trianglepool;
// The single global rasterizer state instance.
318 DPSOFTRAST_State dpsoftrast;
320 extern int dpsoftrast_test;
// Scale applied when converting a float depth to its integer form: 2^30.
322 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
323 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit pixel: a in bits 24-31, r in
// bits 16-23, g in bits 8-15, b in bits 0-7; each channel is scaled by 255 and
// rounded by adding 0.5 before truncation. No clamping is performed.
// NOTE(review): the arguments are not parenthesized in the expansion, so callers
// should pass simple expressions only.
324 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts a float depth d to an integer as DPSOFTRAST_DEPTHSCALE * (1 - d).
// NOTE(review): d is not parenthesized in the expansion.
325 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
326 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Recomputes thread->fb_clearscissor from the thread's scissor state and the
// current framebuffer dimensions.
328 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
330 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
331 // and viewport projection values
// scissor rectangle as x1..x2 horizontally; vertically the rectangle is
// measured from fb_height downwards, which flips the y axis
334 x1 = thread->scissor[0];
335 x2 = thread->scissor[0] + thread->scissor[2];
336 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
337 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off, the rectangle is the whole framebuffer
338 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer
340 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
342 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as origin (x1, y1) followed by extent (x2 - x1, y2 - y1)
343 thread->fb_clearscissor[0] = x1;
344 thread->fb_clearscissor[1] = y1;
345 thread->fb_clearscissor[2] = x2 - x1;
346 thread->fb_clearscissor[3] = y2 - y1;
// Derives the effective depth function: the configured depthfunc while depth
// testing is enabled, GL_ALWAYS otherwise.
349 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
351 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Derives thread->fb_blendmode from the blend factor pair. The two factors are
// packed into one integer (source factor in the upper 16 bits) so that a single
// switch can match the pair; unmatched pairs fall back to OPAQUE.
354 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_COLOR, GL_ONE) is listed
356 if (thread->blendsubtract)
358 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC(s, d, mode) emits the case label for the pair (s, d) that selects mode
360 #define BLENDFUNC(sfactor, dfactor, blendmode) \
361 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
362 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
363 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second table of factor pairs
368 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
370 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
371 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
372 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
373 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
374 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
375 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
376 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
377 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
378 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
379 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
380 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested flags f is
// pending in thread->validate; the expression always evaluates to 0.
// NOTE(review): thread is not parenthesized in the expansion.
385 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived values selected by mask that are currently marked
// stale, clearing each flag before its Recalc function runs.
387 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested flags that are actually pending
389 mask &= thread->validate;
392 if (mask & DPSOFTRAST_VALIDATE_FB)
394 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
395 DPSOFTRAST_RecalcFB(thread);
397 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
399 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
400 DPSOFTRAST_RecalcDepthFunc(thread);
402 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
404 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
405 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. A slot is returned only when index is at
// least 1, below dpsoftrast.texture_end, and the slot has pixel storage
// (bytes is non-NULL).
409 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
411 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
412 return &dpsoftrast.texture[index];
// Creates a texture: validates the arguments (recording a message in
// dpsoftrast.errorstring on rejection), picks a free slot in dpsoftrast.texture,
// fills in the mip level table and allocates zeroed pixel storage.
416 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six sides, everything else one
425 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
426 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
427 DPSOFTRAST_Texture *texture;
// rejects zero-sized textures (any dimension 0 makes the product 0)
428 if (width*height*depth < 1)
430 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
433 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
435 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// per-format restrictions
440 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
441 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
442 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
444 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
445 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
447 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
452 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
455 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
457 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures (depth != 1)
462 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
464 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
467 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
469 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the preceding condition and message verbatim
472 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
474 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
477 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
479 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x has more than one bit set
482 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
484 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
488 // find first empty slot in texture array
489 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
490 if (!dpsoftrast.texture[texnum].bytes)
492 dpsoftrast.texture_firstfree = texnum + 1;
493 if (dpsoftrast.texture_max <= texnum)
495 // expand texture array as needed
496 if (dpsoftrast.texture_max < 1024)
497 dpsoftrast.texture_max = 1024;
499 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture
// and is not checked; on failure the previous array pointer would be lost
500 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
502 if (dpsoftrast.texture_end <= texnum)
503 dpsoftrast.texture_end = texnum + 1;
// start from a zeroed slot, then record the requested properties
504 texture = &dpsoftrast.texture[texnum];
505 memset(texture, 0, sizeof(*texture));
506 texture->flags = flags;
507 texture->width = width;
508 texture->height = height;
509 texture->depth = depth;
510 texture->sides = sides;
// bytes occupied by this mip level: 4 bytes per texel across all sides
521 s = w * h * d * sides * 4;
// level table entry: running byte offset, level byte size, then dimensions
522 texture->mipmap[mipmaps][0] = size;
523 texture->mipmap[mipmaps][1] = s;
524 texture->mipmap[mipmaps][2] = w;
525 texture->mipmap[mipmaps][3] = h;
526 texture->mipmap[mipmaps][4] = d;
// condition for the last level: a single texel, or mipmapping not requested
529 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
535 texture->mipmaps = mipmaps;
536 texture->size = size;
538 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the visible code; a
// NULL bytes pointer makes the slot look free to DPSOFTRAST_Texture_GetByIndex
539 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free/used bookkeeping. Unknown or unused indices are ignored.
543 void DPSOFTRAST_Texture_Free(int index)
545 DPSOFTRAST_Texture *texture;
546 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
549 MM_FREE(texture->bytes);
550 texture->bytes = NULL;
551 memset(texture, 0, sizeof(*texture));
552 // adjust the free range and used range
// the freed slot becomes the new search start if it lies below the old one
553 if (dpsoftrast.texture_firstfree > index)
554 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing unused slots
555 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
556 dpsoftrast.texture_end--;
// Regenerates every mip level above level 0 by averaging texels of the level
// below it, one byte channel at a time (4 bytes per texel).
558 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
560 int i, x, y, z, w, layer0, layer1, row0, row1;
561 unsigned char *o, *i0, *i1, *i2, *i3;
562 DPSOFTRAST_Texture *texture;
563 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// textures with a single level have nothing to generate
564 if (texture->mipmaps <= 1)
566 for (i = 1;i < texture->mipmaps;i++)
568 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's last layer
572 if (layer1 >= texture->mipmap[i-1][4])
573 layer1 = texture->mipmap[i-1][4]-1;
574 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's last row
578 if (row1 >= texture->mipmap[i-1][3])
579 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four parent rows
// (layer0/layer1 x row0/row1) in level i-1
580 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
581 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
582 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
583 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
584 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
585 w = texture->mipmap[i][2];
588 if (texture->mipmap[i-1][2] > 1)
590 // average 3D texture
// eight source texels per output texel; +4 rounds before dividing by 8
591 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
593 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
594 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
595 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
596 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
601 // average 3D mipmap with parent width == 1
// four source texels per output texel; +2 rounds before dividing by 4
602 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
604 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
605 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
606 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
607 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
613 if (texture->mipmap[i-1][2] > 1)
615 // average 2D texture (common case)
// 2x2 box filter over rows i0 and i1
616 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
618 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
619 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
620 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
621 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
626 // 2D texture with parent width == 1
// two source texels per output texel; +1 rounds before dividing by 2
627 o[0] = (i0[0] + i1[0] + 1) >> 1;
628 o[1] = (i0[1] + i1[1] + 1) >> 1;
629 o[2] = (i0[2] + i1[2] + 1) >> 1;
630 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangular block of 4-byte pixels into the texture row by row and
// then regenerates the mip levels. pixels is read as tightly packed rows of
// blockwidth pixels.
// NOTE(review): the destination address and row pitch use mip level 0's width;
// the mip argument is not used by the statements visible here - confirm.
637 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
639 DPSOFTRAST_Texture *texture;
641 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel: row blocky, column blockx of level 0
643 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
644 while (blockheight > 0)
646 memcpy(dst, pixels, blockwidth * 4);
// source advances by one block row, destination by one full texture row
647 pixels += blockwidth * 4;
648 dst += texture->mipmap[0][2] * 4;
651 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with pixels (mipmap[0][1] bytes are copied)
// and then regenerates the remaining mip levels.
653 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
655 DPSOFTRAST_Texture *texture;
656 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
659 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level mip, or 0 when index does not name a texture.
// NOTE(review): mip is used as an array index without a range check.
661 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
663 DPSOFTRAST_Texture *texture;
664 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
665 return texture->mipmap[mip][2];
// Returns the height of mip level mip, or 0 when index does not name a texture.
// NOTE(review): mip is used as an array index without a range check.
667 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
669 DPSOFTRAST_Texture *texture;
670 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
671 return texture->mipmap[mip][3];
// Returns the depth of mip level mip, or 0 when index does not name a texture.
// NOTE(review): mip is used as an array index without a range check.
673 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
675 DPSOFTRAST_Texture *texture;
676 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
677 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture's
// pixel storage, or a null pointer when index does not name a texture.
// NOTE(review): mip is used as an array index without a range check.
679 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
681 DPSOFTRAST_Texture *texture;
682 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
684 return texture->bytes + texture->mipmap[mip][0];
// Sets the texture's filter mode. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported as an error for textures
// created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
686 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
688 DPSOFTRAST_Texture *texture;
689 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
690 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
692 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
696 texture->filter = filter;
// Selects the framebuffer: its dimensions, the depth buffer and up to four
// color buffers. The stored state is only rewritten when at least one of the
// arguments differs from the current values.
699 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
701 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
702 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
703 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
705 dpsoftrast.fb_width = width;
706 dpsoftrast.fb_height = height;
707 dpsoftrast.fb_depthpixels = depthpixels;
708 dpsoftrast.fb_colorpixels[0] = colorpixels0;
709 dpsoftrast.fb_colorpixels[1] = colorpixels1;
710 dpsoftrast.fb_colorpixels[2] = colorpixels2;
711 dpsoftrast.fb_colorpixels[3] = colorpixels3;
714 void DPSOFTRAST_Draw_FlushThreads(void);
// Makes room for `space` entries in the triangle ring. When the recorded usage
// is already low enough nothing needs to be done; otherwise the actual usage is
// recomputed from the per-thread read positions under trianglemutex, waiting on
// a thread's condition variable while the ring is still too full.
716 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
718 DPSOFTRAST_State_Thread *thread;
720 int freetriangle = dpsoftrast.trianglepool.freetriangle;
721 int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
// fast path test: enough free entries already
722 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
725 SDL_LockMutex(dpsoftrast.trianglemutex);
732 for (i = 0; i < dpsoftrast.numthreads; i++)
734 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the write position, modulo ring size
735 triangleoffset = freetriangle - thread->triangleoffset;
736 if (triangleoffset < 0)
737 triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
// track the largest distance seen
738 if (triangleoffset > usedtriangles)
741 usedtriangles = triangleoffset;
// test for leaving the wait loop: enough room, or no thread to wait for
744 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
// flag the selected thread, wake any threads waiting on trianglecond, then
// block on that thread's own condition variable
747 thread = &dpsoftrast.threads[waitindex];
748 thread->waiting = true;
749 SDL_CondBroadcast(dpsoftrast.trianglecond);
750 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
751 thread->waiting = false;
755 SDL_UnlockMutex(dpsoftrast.trianglemutex);
757 dpsoftrast.trianglepool.usedtriangles = usedtriangles;
// Appends a marker entry to the triangle ring that records the current command
// pool write position, then publishes the new ring write index in
// dpsoftrast.drawtriangle.
760 void DPSOFTRAST_Draw_SyncCommands(void)
762 DPSOFTRAST_State_Triangle *triangle;
// make room first when the ring is (almost) full
763 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
765 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
767 DPSOFTRAST_Draw_FlushThreads();
769 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
770 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
// starty is set to -1 for this marker entry
771 triangle->starty = -1;
// advance the write index, wrapping to 0 at the end of the ring
773 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
774 dpsoftrast.trianglepool.usedtriangles++;
776 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
// Makes room for `space` bytes in the command ring. Mirrors
// DPSOFTRAST_Draw_FreeTrianglePool: the usage is recomputed from the per-thread
// commandoffset positions under trianglemutex, waiting on a thread's condition
// variable while the ring is still too full.
779 void DPSOFTRAST_Draw_FreeCommandPool(int space)
781 DPSOFTRAST_State_Thread *thread;
783 int freecommand = dpsoftrast.commandpool.freecommand;
784 int usedcommands = dpsoftrast.commandpool.usedcommands;
// fast path test: enough free bytes already
785 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
787 DPSOFTRAST_Draw_SyncCommands();
789 SDL_LockMutex(dpsoftrast.trianglemutex);
796 for (i = 0; i < dpsoftrast.numthreads; i++)
798 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the write position, modulo pool size
799 commandoffset = freecommand - thread->commandoffset;
800 if (commandoffset < 0)
801 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// track the largest distance seen
802 if (commandoffset > usedcommands)
805 usedcommands = commandoffset;
// test for leaving the wait loop: enough room, or no thread to wait for
808 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// flag the selected thread, wake any threads waiting on trianglecond, then
// block on that thread's own condition variable
811 thread = &dpsoftrast.threads[waitindex];
812 thread->waiting = true;
813 SDL_CondBroadcast(dpsoftrast.trianglecond);
814 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
815 thread->waiting = false;
819 SDL_UnlockMutex(dpsoftrast.trianglemutex);
821 dpsoftrast.commandpool.usedcommands = usedcommands;
// Allocates a DPSOFTRAST_Command_<name> from the command pool. The requested
// size is sizeof the struct rounded up to the next multiple of COMMAND_SIZE
// (the masking assumes COMMAND_SIZE is a power of two).
824 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
825 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))))
// Reserves size bytes in the command ring and updates the pool bookkeeping.
827 static void *DPSOFTRAST_AllocateCommand(int size)
829 DPSOFTRAST_Command *command;
830 int freecommand = dpsoftrast.commandpool.freecommand;
831 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header
832 int extra = sizeof(DPSOFTRAST_Command);
// if the command does not fit before the end of the buffer, the unused tail
// has to be accounted for as well
833 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
834 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free bytes: free up space before continuing
835 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
838 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
840 DPSOFTRAST_Draw_FlushThreads();
// re-read the bookkeeping, which the calls above may have changed
842 freecommand = dpsoftrast.commandpool.freecommand;
843 usedcommands = dpsoftrast.commandpool.usedcommands;
// tail too small: mark it with a Reset opcode and count it as used
845 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
847 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
848 command->opcode = DPSOFTRAST_OPCODE_Reset;
849 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// the new command lives at the current write position
852 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// publish the updated write position and usage
857 dpsoftrast.commandpool.freecommand = freecommand;
858 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Viewport command (opcode 1): payload is the viewport rectangle.
862 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Copies the rectangle into the thread's viewport state and marks the
// framebuffer-derived values as stale.
863 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
865 thread->viewport[0] = command->x;
866 thread->viewport[1] = command->y;
867 thread->viewport[2] = command->width;
868 thread->viewport[3] = command->height;
869 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Sets the viewport: allocates a Viewport command in the command pool, stores
// the rectangle in the global state and recomputes the viewport transform terms.
871 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
873 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874 command->opcode = DPSOFTRAST_OPCODE_Viewport;
877 command->width = width;
878 command->height = height;
880 dpsoftrast.viewport[0] = x;
881 dpsoftrast.viewport[1] = y;
882 dpsoftrast.viewport[2] = width;
883 dpsoftrast.viewport[3] = height;
// center/scale are stored with element [0] as the constant term and elements
// [1], [2], [3] as the x, y and depth terms; the y terms use fb_height and a
// negative scale, which flips the vertical axis
884 dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
885 dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
886 dpsoftrast.fb_viewportcenter[3] = 0.5f;
887 dpsoftrast.fb_viewportcenter[0] = 0.0f;
888 dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
889 dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
890 dpsoftrast.fb_viewportscale[3] = 0.5f;
891 dpsoftrast.fb_viewportscale[0] = 1.0f;
// ClearColor command (opcode 2): payload is the clear color as four floats.
894 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Fills the clear rectangle of every bound color buffer with the packed color.
895 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
897 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
// make sure fb_clearscissor is up to date
900 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
901 x1 = thread->fb_clearscissor[0];
902 y1 = thread->fb_clearscissor[1];
// NOTE(review): DPSOFTRAST_RecalcFB stores fb_clearscissor[2] as a width
// (x2 - x1), and y2 below adds the origin to the height, but x2 takes the
// width directly - looks like it should be [0] + [2]; confirm.
903 x2 = thread->fb_clearscissor[2];
904 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2: this thread's share of the framebuffer rows, fb_height divided
// evenly by thread index
905 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
906 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
913 // FIXME: honor fb_colormask?
914 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// visit each of the four color buffer slots that is bound
915 for (i = 0;i < 4;i++)
917 if (!dpsoftrast.fb_colorpixels[i])
919 for (y = y1;y < y2;y++)
// p: start of row y (rows are fb_width pixels apart)
921 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
922 for (x = x1;x < x2;x++)
// Allocates a ClearColor command in the command pool for the given color.
927 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
929 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
930 command->opcode = DPSOFTRAST_OPCODE_ClearColor;
// ClearDepth command (opcode 3): payload is the float depth to clear to.
937 DEFCOMMAND(3, ClearDepth, float depth;)
// Fills the clear rectangle of the depth buffer with the converted depth value.
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
940 int x1, y1, x2, y2, w, h, x, y, t1, t2;
// make sure fb_clearscissor is up to date
943 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
944 x1 = thread->fb_clearscissor[0];
945 y1 = thread->fb_clearscissor[1];
// NOTE(review): DPSOFTRAST_RecalcFB stores fb_clearscissor[2] as a width
// (x2 - x1), and y2 below adds the origin to the height, but x2 takes the
// width directly - looks like it should be [0] + [2]; confirm.
946 x2 = thread->fb_clearscissor[2];
947 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2: this thread's share of the framebuffer rows, fb_height divided
// evenly by thread index
948 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
949 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// integer depth value written to the buffer
956 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
957 for (y = y1;y < y2;y++)
// p: start of row y (rows are fb_width pixels apart)
959 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
960 for (x = x1;x < x2;x++)
// Allocates a ClearDepth command in the command pool for depth d.
964 void DPSOFTRAST_ClearDepth(float d)
966 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
967 command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
// ColorMask command (opcode 4): payload is one enable flag per channel.
971 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores the flags normalized to 0/1 and builds the matching 32-bit pixel mask.
972 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
974 thread->colormask[0] = command->r != 0;
975 thread->colormask[1] = command->g != 0;
976 thread->colormask[2] = command->b != 0;
977 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which selects that channel's byte;
// the byte positions match DPSOFTRAST_BGRA8_FROM_RGBA32F
// (r: bits 16-23, g: bits 8-15, b: bits 0-7, a: bits 24-31)
978 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Allocates a ColorMask command in the command pool.
980 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
982 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
983 command->opcode = DPSOFTRAST_OPCODE_ColorMask;
// DepthTest command (opcode 5): payload is the enable flag.
990 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the flag and marks the derived depth function as stale.
991 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
993 thread->depthtest = command->enable;
994 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Allocates a DepthTest command in the command pool carrying the enable flag.
996 void DPSOFTRAST_DepthTest(int enable)
998 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
999 command->opcode = DPSOFTRAST_OPCODE_DepthTest;
1000 command->enable = enable;
// ScissorTest command (opcode 6): payload is the enable flag.
1003 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the flag and marks the framebuffer-derived values as stale.
1004 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1006 thread->scissortest = command->enable;
1007 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a ScissorTest command in the command pool carrying the enable flag.
1009 void DPSOFTRAST_ScissorTest(int enable)
1011 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1012 command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1013 command->enable = enable;
// Scissor command (opcode 7): payload is the scissor rectangle as floats.
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Copies the rectangle into the thread's scissor state and marks the
// framebuffer-derived values as stale.
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1019 thread->scissor[0] = command->x;
1020 thread->scissor[1] = command->y;
1021 thread->scissor[2] = command->width;
1022 thread->scissor[3] = command->height;
1023 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a Scissor command in the command pool for the given rectangle.
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1027 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1028 command->opcode = DPSOFTRAST_OPCODE_Scissor;
1031 command->width = width;
1032 command->height = height;
// BlendFunc command (opcode 8): payload is the source/destination factor pair.
1035 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the factor pair and marks the derived blend mode as stale.
1036 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1038 thread->blendfunc[0] = command->sfactor;
1039 thread->blendfunc[1] = command->dfactor;
1040 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Allocates a BlendFunc command in the command pool for the given factors.
1042 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1044 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1045 command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1046 command->sfactor = sfactor;
1047 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): payload is the enable flag.
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
// Stores the flag and marks the derived blend mode as stale.
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1053 thread->blendsubtract = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Allocates a BlendSubtract command in the command pool carrying the enable flag.
1056 void DPSOFTRAST_BlendSubtract(int enable)
1058 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059 command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1060 command->enable = enable;
// DepthMask command (opcode 10): payload is the enable flag.
1063 DEFCOMMAND(10, DepthMask, int enable;)
// Stores the flag in the thread state.
1064 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1066 thread->depthmask = command->enable;
// Allocates a DepthMask command in the command pool carrying the enable flag.
1068 void DPSOFTRAST_DepthMask(int enable)
1070 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1071 command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1072 command->enable = enable;
// DepthFunc command (opcode 11): payload is the comparison function.
1075 DEFCOMMAND(11, DepthFunc, int func;)
// Stores the comparison function in the thread state.
// NOTE(review): fb_depthfunc is derived from depthfunc in
// DPSOFTRAST_RecalcDepthFunc, but no DPSOFTRAST_VALIDATE_DEPTHFUNC flag is
// raised in the statement visible here - confirm this is intended.
1076 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1078 thread->depthfunc = command->func;
// Allocates a DepthFunc command in the command pool carrying func.
1080 void DPSOFTRAST_DepthFunc(int func)
1082 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1083 command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1084 command->func = func;
// DepthRange command (opcode 12): payload is the near/far value pair.
1087 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Stores the pair as depthrange[0] (near) and depthrange[1] (far).
1088 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1090 thread->depthrange[0] = command->nearval;
1091 thread->depthrange[1] = command->farval;
// Allocates a DepthRange command in the command pool for the given pair.
1093 void DPSOFTRAST_DepthRange(float nearval, float farval)
1095 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1096 command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1097 command->nearval = nearval;
1098 command->farval = farval;
// Declares opcode 13 and the DPSOFTRAST_Command_PolygonOffset record; payload is two floats, alongnormal and intoview.
1101 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: alongnormal goes to thread->polygonoffset[0], intoview to thread->polygonoffset[1].
1102 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1104 thread->polygonoffset[0] = command->alongnormal;
1105 thread->polygonoffset[1] = command->intoview;
// Records a PolygonOffset command: allocates the record, sets its opcode and stores both offsets.
//   alongnormal - later written to thread->polygonoffset[0]
//   intoview    - later written to thread->polygonoffset[1]
1107 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1109 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1110 command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1111 command->alongnormal = alongnormal;
1112 command->intoview = intoview;
// Sets the cull mode. Unlike the neighbouring setters, the visible code allocates no
// command record: mode is written straight into the global dpsoftrast.cullface.
1115 void DPSOFTRAST_CullFace(int mode)
1117 dpsoftrast.cullface = mode;
// Declares opcode 15 and the DPSOFTRAST_Command_AlphaTest record; its payload is one int, enable.
1120 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: copies command->enable into thread->alphatest,
// the flag the span-finish routines check before masking pixels by alpha.
1121 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1123 thread->alphatest = command->enable;
// Records an AlphaTest command: allocates the record, sets its opcode and stores enable.
//   enable - value later copied to thread->alphatest by DPSOFTRAST_Interpret_AlphaTest
1125 void DPSOFTRAST_AlphaTest(int enable)
1127 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1128 command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1129 command->enable = enable;
// Declares opcode 16 and the DPSOFTRAST_Command_AlphaFunc record; payload is an int func and a float ref.
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: func goes to thread->alphafunc, ref to thread->alphavalue.
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1135 thread->alphafunc = command->func;
1136 thread->alphavalue = command->ref;
// Records an AlphaFunc command: allocates the record, sets its opcode and stores func.
//   func - later written to thread->alphafunc
//   ref  - reference value that DPSOFTRAST_Interpret_AlphaFunc reads back as command->ref
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1140 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141 command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1142 command->func = func;
// NOTE(review): no visible line stores ref into command->ref, although the interpreter reads it;
// source line 1143 is not shown here - confirm the assignment exists there.
// Sets the current constant color by writing r, g, b, a into dpsoftrast.color[0..3].
// DPSOFTRAST_Draw_LoadVertices replicates this value across the color array
// when no color pointer is set.
1146 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1148 dpsoftrast.color[0] = r;
1149 dpsoftrast.color[1] = g;
1150 dpsoftrast.color[2] = b;
1151 dpsoftrast.color[3] = a;
// Reads a rectangular block of the color buffer dpsoftrast.fb_colorpixels[0] into outpixels.
//   blockx, blocky          - origin of the requested block
//   blockwidth, blockheight - size of the requested block in pixels
//   outpixels               - destination; rows are blockwidth*4 bytes apart
// The requested rectangle is clamped to the framebuffer before copying.
// NOTE(review): the declarations of bx1/by1 and the per-pixel copy statements are not
// visible in this listing; bx1/by1 presumably start as blockx/blocky - confirm.
1154 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides: 4 bytes per pixel on both sides
1156 int outstride = blockwidth * 4;
1157 int instride = dpsoftrast.fb_width * 4;
1160 int bx2 = blockx + blockwidth;
1161 int by2 = blocky + blockheight;
1166 unsigned char *inpixels;
// clamp the block to [0, fb_width) x [0, fb_height)
1170 if (bx1 < 0) bx1 = 0;
1171 if (by1 < 0) by1 = 0;
1172 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1173 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1176 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// two copy loops follow, selected by dpsoftrast.bigendian
1177 if (dpsoftrast.bigendian)
1179 for (y = by1;y < by2;y++)
// source row index is (fb_height - 1 - y), so rows are read in vertically flipped order;
// output rows are indexed relative to the clamped by1
1181 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182 o = (unsigned char *)outpixels + (y - by1) * outstride;
1183 for (x = bx1;x < bx2;x++)
1196 for (y = by1;y < by2;y++)
1198 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1199 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color buffer dpsoftrast.fb_colorpixels[0] into one mip level of a texture.
//   index         - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
//   mip           - destination mip level; must satisfy 0 <= mip < texture->mipmaps
//   tx, ty        - destination origin in the texture
//   sx, sy        - source origin in the framebuffer
//   width, height - requested size in pixels
// Returns without copying when the texture handle or mip level is invalid.
// NOTE(review): declarations of tx1/ty1/sx1/sy1, tw/th/sw/sh and the statement taken when
// tw < 1 || th < 1 are not visible in this listing - presumably an early return; confirm.
1205 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1209 int tx2 = tx + width;
1210 int ty2 = ty + height;
1213 int sx2 = sx + width;
1214 int sy2 = sy + height;
1224 unsigned int *spixels;
1225 unsigned int *tpixels;
1226 DPSOFTRAST_Texture *texture;
1227 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1228 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the framebuffer, addressed as 32-bit pixels
1230 spixels = dpsoftrast.fb_colorpixels[0];
1231 swidth = dpsoftrast.fb_width;
1232 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] as width and [3] as height
1233 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1234 twidth = texture->mipmap[mip][2];
1235 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle to the framebuffer
1236 if (tx1 < 0) tx1 = 0;
1237 if (ty1 < 0) ty1 = 0;
1238 if (tx2 > twidth) tx2 = twidth;
1239 if (ty2 > theight) ty2 = theight;
1240 if (sx1 < 0) sx1 = 0;
1241 if (sy1 < 0) sy1 = 0;
1242 if (sx2 > swidth) sx2 = swidth;
1243 if (sy2 > sheight) sy2 = sheight;
// the copied size is limited to the smaller of the two clamped rectangles
1248 if (tw > sw) tw = sw;
1249 if (th > sh) th = sh;
1250 if (tw < 1 || th < 1)
// copy row by row, tw pixels of 4 bytes each
1252 for (y = 0;y < th;y++)
1253 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// textures with more than one mip level get their mip chain recalculated after the copy
1254 if (texture->mipmaps > 1)
1255 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Declares opcode 17 and the DPSOFTRAST_Command_SetTexture record; payload is a unit number and a texture pointer.
1258 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command: binds command->texture to slot command->unitnum of thread->texbound.
// No range check is done here; DPSOFTRAST_SetTexture validates unitnum before recording.
1259 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1261 thread->texbound[command->unitnum] = command->texture;
// Binds a texture to a texture unit.
//   unitnum - unit to bind; must satisfy 0 <= unitnum < DPSOFTRAST_MAXTEXTUREUNITS
//   index   - texture handle; 0 is accepted even when the lookup yields no texture
// On invalid input an explanatory message is stored in dpsoftrast.errorstring.
// NOTE(review): the statements following each errorstring assignment are not visible in
// this listing - presumably an early return; confirm.
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
// a nonzero handle that resolves to nothing is an error
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
// record the command for the threads...
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1281 command->unitnum = unitnum;
1282 command->texture = texture;
// ...and mirror the binding in the global state
1284 dpsoftrast.texbound[unitnum] = texture;
// Sets the vertex position array used by DPSOFTRAST_Draw_LoadVertices.
//   vertex3f - array base; elements are read as 3 floats each
//   stride   - byte distance between consecutive elements (used as a byte offset by the loader)
// Only the pointer is stored; the data is not copied here.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Sets a float color array (4 floats per element) and clears the unsigned-byte color
// pointer, so only one of the two color sources is set at a time.
//   color4f - array base; may be NULL, in which case the loader falls back to dpsoftrast.color
//   stride  - byte distance between consecutive elements
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Sets an unsigned-byte color array (4 bytes per element) and clears the float color
// pointer, so only one of the two color sources is set at a time.
//   color4ub - array base
//   stride   - byte distance between consecutive elements
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Sets the texture coordinate array for one unit.
//   unitnum       - index into the per-unit pointer/components/stride tables
//   numcomponents - floats per element; DPSOFTRAST_Draw_LoadVertices switches on this value
//   stride        - byte distance between consecutive elements
//   texcoordf     - array base; a NULL pointer makes the loader skip this unit
// NOTE(review): unitnum is used as an array index without a visible range check - confirm callers bound it.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// Declares opcode 18 and the DPSOFTRAST_Command_SetShader record; payload is two ints, mode and permutation.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Applies a SetShader command: copies mode and permutation into the thread's shader state.
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Selects the shader mode and permutation: records a SetShader command for the threads
// and mirrors both values in the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->opcode = DPSOFTRAST_OPCODE_SetShader;
1321 command->mode = mode;
1322 command->permutation = permutation;
// global copy of the same setting
1324 dpsoftrast.shader_mode = mode;
1325 dpsoftrast.shader_permutation = permutation;
// Declares opcode 19 and the DPSOFTRAST_Command_Uniform4f record; payload is a uniform index and 4 floats.
1328 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the 4 floats of command->val into
// thread->uniform4f starting at float offset index*4.
1329 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1331 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component float uniform from individual values.
//   index        - uniform slot; the value occupies floats index*4 .. index*4+3
//   v0 .. v3     - the four components
// Records a Uniform4f command for the threads and mirrors the value in dpsoftrast.uniform4f.
1333 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1335 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1336 command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1337 command->index = index;
1338 command->val[0] = v0;
1339 command->val[1] = v1;
1340 command->val[2] = v2;
1341 command->val[3] = v3;
// global copy of the same value
1343 dpsoftrast.uniform4f[index*4+0] = v0;
1344 dpsoftrast.uniform4f[index*4+1] = v1;
1345 dpsoftrast.uniform4f[index*4+2] = v2;
1346 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: sets a 4-component float uniform from memory.
//   index - uniform slot; the value occupies floats index*4 .. index*4+3
//   v     - source; exactly 4 floats are read from it
// Reuses the Uniform4f command record and mirrors the value in dpsoftrast.uniform4f.
1348 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1350 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1351 command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1352 command->index = index;
1353 memcpy(command->val, v, sizeof(command->val));
// global copy of the same value
1355 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Declares opcode 20 and the DPSOFTRAST_Command_UniformMatrix4f record; payload is a uniform
// index and 16 floats declared through ALIGN().
1358 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats of command->val into
// thread->uniform4f starting at float offset index*4 (i.e. four consecutive 4-float slots).
1359 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1361 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets one or more 4x4 float matrix uniforms.
//   uniform   - first uniform slot; each matrix takes 4 consecutive slots (index += 4 per matrix)
//   arraysize - number of matrices to read
//   transpose - transposition flag
//   v         - source; 16 floats are read per matrix
// Each matrix is recorded as its own UniformMatrix4f command and mirrored in dpsoftrast.uniform4f.
// NOTE(review): the branch structure (else, the test on transpose, local declarations) is not
// visible in this listing; the unpack/move sequence below is presumably guarded by transpose - confirm.
1363 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1367 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1369 __m128 m0, m1, m2, m3;
1370 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1371 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1372 command->index = index;
// pick unaligned or aligned loads depending on the source address
1373 if (((size_t)v)&(ALIGN_SIZE-1))
1375 m0 = _mm_loadu_ps(v);
1376 m1 = _mm_loadu_ps(v+4);
1377 m2 = _mm_loadu_ps(v+8);
1378 m3 = _mm_loadu_ps(v+12);
1382 m0 = _mm_load_ps(v);
1383 m1 = _mm_load_ps(v+4);
1384 m2 = _mm_load_ps(v+8);
1385 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3: interleave row pairs, then recombine the halves
1389 __m128 t0, t1, t2, t3;
1390 t0 = _mm_unpacklo_ps(m0, m1);
1391 t1 = _mm_unpacklo_ps(m2, m3);
1392 t2 = _mm_unpackhi_ps(m0, m1);
1393 t3 = _mm_unpackhi_ps(m2, m3);
1394 m0 = _mm_movelh_ps(t0, t1);
1395 m1 = _mm_movehl_ps(t1, t0);
1396 m2 = _mm_movelh_ps(t2, t3);
1397 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both the command payload and the global uniform slot must be 16-byte aligned
1399 _mm_store_ps(command->val, m0);
1400 _mm_store_ps(command->val+4, m1);
1401 _mm_store_ps(command->val+8, m2);
1402 _mm_store_ps(command->val+12, m3);
1403 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1404 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1405 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1406 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Declares opcode 21 and the DPSOFTRAST_Command_Uniform1i record; payload is a uniform index and one int.
1411 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores command->val in thread->uniform1i[command->index].
1412 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1414 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: records a Uniform1i command and mirrors the value in dpsoftrast.uniform1i.
//   index - uniform slot
//   i0    - value to store
1416 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1418 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1419 command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1420 command->index = index;
// NOTE(review): no visible line stores i0 into command->val, although the interpreter reads it;
// source line 1421 is not shown here - confirm the assignment exists there.
// global copy of the same value
1423 dpsoftrast.uniform1i[command->index] = i0;
// Copies size elements of 4 floats each from a strided source into a packed destination.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// NOTE(review): the loop headers and pointer increments are not visible in this listing.
1427 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1429 float *end = dst + size*4;
// aligned loads are only safe when both the base address and the stride are multiples of ALIGN_SIZE
1430 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1434 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1443 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands size elements of 3 floats each into 4-float elements whose last component is 1.0f.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// When the source is tightly packed (stride == sizeof(float[3])) four elements are produced
// from three 16-byte loads; otherwise one element is produced per 16-byte load.
// NOTE(review): loop headers and dst increments are not visible in this listing.
// NOTE(review): every 16-byte load over a 3-float element also reads the float that follows
// it; for the last element that is memory after the array - confirm callers guarantee padding.
1450 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1452 float *end = dst + size*4;
1453 if (stride == sizeof(float[3]))
// end of the part that can be processed four elements at a time
1455 float *end4 = dst + (size&~3)*4;
1456 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1..v3 hold 12 packed floats = 4 elements: x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3
1460 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// idiom used throughout: rotate so the unwanted lane is lane 0, overwrite it with 1.0f, rotate back
1461 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1462 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1464 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1465 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1466 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1467 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1468 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1469 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1474 src += 4*sizeof(float[3]);
// same sequence with aligned loads
1481 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1482 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1483 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1486 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1487 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1488 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1489 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1490 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1493 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1495 src += 4*sizeof(float[3]);
// one-element-at-a-time path; aligned loads need an aligned base address and stride
1499 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1503 __m128 v = _mm_loadu_ps((const float *)src);
// (x, y, z, ?) -> (?, x, y, z) -> (1, x, y, z) -> (x, y, z, 1)
1504 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507 _mm_store_ps(dst, v);
1516 __m128 v = _mm_load_ps((const float *)src);
1517 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1518 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1519 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1520 _mm_store_ps(dst, v);
// Expands size elements of 2 floats each into 4-float elements of the form (x, y, 0, 1).
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// Tightly packed input (stride == sizeof(float[2])) is converted two elements per 16-byte load.
// NOTE(review): loop headers and dst increments are not visible in this listing.
1527 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1529 float *end = dst + size*4;
// constant supplying the upper two lanes (0, 1) of every output element
1530 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1531 if (stride == sizeof(float[2]))
// end of the part that can be processed two elements at a time
1533 float *end2 = dst + (size&~1)*4;
1534 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1538 __m128 v = _mm_loadu_ps((const float *)src);
// first element: low half of v + high half of v2; second element: high half of v + high half of v2
1539 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1540 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1542 src += 2*sizeof(float[2]);
1549 __m128 v = _mm_load_ps((const float *)src);
1550 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1551 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553 src += 2*sizeof(float[2]);
// single element: load 8 bytes into the low half, keep (0, 1) from v2 in the high half
1559 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts size elements of 4 unsigned bytes each into 4-float elements scaled by 1/255.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// Tightly packed input (stride == 4) is converted four elements per 16-byte load.
// NOTE(review): loop headers and dst increments are not visible in this listing.
1565 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1567 float *end = dst + size*4;
1568 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1569 if (stride == sizeof(unsigned char[4]))
// end of the part that can be processed four elements at a time
1571 float *end4 = dst + (size&~3)*4;
1572 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen 16 bytes to two vectors of 8 x 16-bit (v1, v2) by interleaving with zero
1576 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// widen again to 32-bit, convert to float and scale; one store per element
1577 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1582 src += 4*sizeof(unsigned char[4]);
// same sequence with an aligned load
1589 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1590 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1591 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1592 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1593 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1595 src += 4*sizeof(unsigned char[4]);
// single element: read 4 bytes as one int, then widen, convert and scale as above
1601 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1602 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a 4-float array with one constant element.
//   dst  - destination, written with aligned 16-byte stores
//   src  - the 4 floats to replicate (read once, alignment not required)
//   size - number of elements; end marks dst + 4*size
// NOTE(review): the loop around the store is not visible in this listing.
1608 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1610 float *end = dst + 4*size;
1611 __m128 v = _mm_loadu_ps(src);
1614 _mm_store_ps(dst, v);
// Converts the currently set vertex arrays into packed 4-float arrays in dpsoftrast.in_array4f.
//   firstvertex - index of the first element to read from each source array
//   numvertices - number of elements to convert
//   needcolors  - NOTE(review): its use is on lines not visible here; presumably it gates the color load - confirm
// Array slots: in_array4f[0] = positions, [1] = colors, [j+2] = texture coordinates of unit j.
// NOTE(review): local declarations, the case labels of the switch and several braces are not
// visible in this listing.
1620 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1629 const unsigned char *b;
1630 dpsoftrast.numvertices = numvertices;
// grow the working storage when the request exceeds the current capacity
1631 if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
// capacity starts at 4096 and doubles until it fits
1633 if (dpsoftrast.maxvertices < 4096)
1634 dpsoftrast.maxvertices = 4096;
1635 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1636 dpsoftrast.maxvertices *= 2;
// in_array4f[0] is the base of the single allocation that backs every array
1637 if (dpsoftrast.in_array4f[0])
1638 MM_FREE(dpsoftrast.in_array4f[0]);
// one zeroed block: ARRAY_TOTAL input arrays + ARRAY_TOTAL post arrays + 1 screen-coordinate array
// NOTE(review): the result is used without a visible NULL check
1639 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
// carve the block into consecutive slices of maxvertices*4 floats
1640 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1641 dpsoftrast.in_array4f[i] = data;
1642 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1643 dpsoftrast.post_array4f[i] = data;
1644 dpsoftrast.screencoord4f = data;
1645 data += dpsoftrast.maxvertices * 4;
// positions: 3 floats per element, expanded to (x, y, z, 1)
1647 stride = dpsoftrast.stride_vertex;
1648 v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1649 p = dpsoftrast.in_array4f[0];
1650 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
// colors: float array if set, else unsigned-byte array if set, else the constant dpsoftrast.color
1653 if (dpsoftrast.pointer_color4f)
1655 stride = dpsoftrast.stride_color;
1656 v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1657 p = dpsoftrast.in_array4f[1];
1658 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1660 else if (dpsoftrast.pointer_color4ub)
1662 stride = dpsoftrast.stride_color;
1663 b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1664 p = dpsoftrast.in_array4f[1];
1665 DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
1669 p = dpsoftrast.in_array4f[1];
1670 DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
// texture coordinates: the remaining ARRAY_TOTAL-2 slots, one per unit; units without a pointer are skipped
1673 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1675 if (dpsoftrast.pointer_texcoordf[j])
1677 stride = dpsoftrast.stride_texcoord[j];
1678 v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1679 p = dpsoftrast.in_array4f[j+2];
// pick the loader matching the component count (presumably cases 2, 3 and 4 - labels not shown)
1680 switch(dpsoftrast.components_texcoord[j])
1683 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1686 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1689 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
// Multiplies an array of 4-float vectors by a 4x4 matrix.
//   out4f       - destination, numitems elements of 4 floats
//   in4f        - source, numitems elements of 4 floats
//   numitems    - element count
//   inmatrix16f - 16 floats; each group of 4 is scaled by one input component and the four are summed
// An exact (bitwise) identity matrix is detected with memcmp and handled with a plain memcpy.
// NOTE(review): loop headers, the store into out4f and pointer increments are not visible in this listing.
1697 void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1700 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1701 __m128 m0, m1, m2, m3;
1702 float *end = out4f + numitems*4;
1703 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1705 // fast case for identity matrix
1706 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// the matrix is always loaded unaligned; only the vector loads are specialised
1709 m0 = _mm_loadu_ps(inmatrix16f);
1710 m1 = _mm_loadu_ps(inmatrix16f + 4);
1711 m2 = _mm_loadu_ps(inmatrix16f + 8);
1712 m3 = _mm_loadu_ps(inmatrix16f + 12);
1713 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1717 __m128 v = _mm_loadu_ps(in4f);
// result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3 (each shuffle broadcasts one component)
1719 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1720 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1721 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1722 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// same computation with aligned vector loads
1731 __m128 v = _mm_load_ps(in4f);
1733 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1734 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1735 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1736 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems elements of 4 floats each from in4f to out4f.
// Uses memcpy, so the two ranges must not overlap.
1744 void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
1746 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Projects one 4-component vertex to screen space: divides by w and applies the viewport
// scale and center from dpsoftrast.fb_viewportscale / fb_viewportcenter.
//   v - input vertex (x, y, z, w)
// NOTE(review): the return statement is not visible in this listing; presumably v is returned - confirm.
1750 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1752 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// broadcast w to all lanes
1753 __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
// rotate to (w, x, y, z) and replace lane 0 with 1.0f, giving (1, x, y, z)
1754 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
// center + scale * (1, x, y, z) / w, so lane 0 carries a 1/w term
1755 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
// rotate back so the 1/w-derived lane ends up last
1756 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
// Copies an array of vertices and also produces their screen-space projection.
//   out4f    - receives an unmodified copy of each input vertex
//   screen4f - receives the projected vertex (same math as DPSOFTRAST_Draw_ProjectVertex)
//   in4f     - source, numitems elements of 4 floats; read with aligned loads
//   numitems - element count
// NOTE(review): the loop header and pointer increments are not visible in this listing.
1761 void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
1764 float *end = out4f + numitems*4;
1765 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// load the vertex and broadcast its w
1768 __m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1769 _mm_store_ps(out4f, v);
// (x, y, z, w) -> (1, x, y, z), then center + scale * that / w, then rotate back for the store
1770 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1771 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1772 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Transforms an array of vertices by a 4x4 matrix and also produces their screen-space projection.
//   out4f       - receives the transformed vertices
//   screen4f    - receives the projected vertices
//   in4f        - source, numitems elements of 4 floats
//   numitems    - element count
//   inmatrix16f - 16 floats, combined as v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3
// An exact (bitwise) identity matrix is delegated to DPSOFTRAST_Array_Project.
// NOTE(review): loop headers, pointer increments and the statement after the delegation
// (presumably a return) are not visible in this listing.
1780 void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
1783 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1784 __m128 m0, m1, m2, m3, viewportcenter, viewportscale;
1785 float *end = out4f + numitems*4;
1786 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1788 DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
1791 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1792 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always loaded unaligned; only the vector loads are specialised
1793 m0 = _mm_loadu_ps(inmatrix16f);
1794 m1 = _mm_loadu_ps(inmatrix16f + 4);
1795 m2 = _mm_loadu_ps(inmatrix16f + 8);
1796 m3 = _mm_loadu_ps(inmatrix16f + 12);
1797 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1801 __m128 v = _mm_loadu_ps(in4f), w;
// matrix multiply
1802 v = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1803 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1804 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1805 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
1806 _mm_store_ps(out4f, v);
// projection: divide (1, x, y, z) by w, apply viewport scale/center, rotate back
1807 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1808 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1809 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1810 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// same computation with aligned vector loads
1820 __m128 v = _mm_load_ps(in4f), w;
1821 v = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1822 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1823 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1824 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
1825 _mm_store_ps(out4f, v);
1826 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1827 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1828 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1829 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Prepares per-pixel 1/w values for a span.
//   thread   - not referenced in the visible lines
//   triangle - supplies the w plane: w[0] = slope per x, w[1] = slope per y, w[2] = constant term
//   span     - supplies startx/endx and the span origin x, y
//   zf       - output buffer
// The reciprocal 1/w is evaluated exactly only at sub-span boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and interpolated linearly in between.
// NOTE(review): the declarations of x, z, dz, the statement carrying endz into z and the
// write into zf are not visible in this listing - presumably zf[x] = z inside the inner loop; confirm.
1838 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1841 int startx = span->startx;
1842 int endx = span->endx;
1843 float wslope = triangle->w[0];
// w at the span origin; x offsets within the span are added below
1844 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1845 float endz = 1.0f / (w + wslope * startx);
1846 for (x = startx;x < endx;)
1848 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to finish on the final pixel
1850 if(nextsub >= endx) nextsub = endsub = endx-1;
1851 endz = 1.0f / (w + wslope * nextsub);
// linear step across the sub-span; 0 when the sub-span is a single pixel
1852 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1853 for (; x <= endsub; x++, z += dz)
// Writes a span of float colors into the color buffer dpsoftrast.fb_colorpixels[0],
// combining them with the existing pixels according to thread->fb_blendmode.
//   thread   - supplies alphatest and fb_blendmode
//   triangle - not referenced in the visible lines
//   span     - supplies startx/endx, the span origin x, y and the pixelmask
//   in4f     - 4 floats per pixel, indexed by x*4; component 3 is alpha
// Components 0 and 2 are swapped between in4f and the stored bytes (in4f[+2] -> byte 0,
// in4f[+0] -> byte 2). Results are clamped to at most 255; only SUBALPHA also clamps at 0.
// NOTE(review): declarations of x, d[], a, b, the per-pixel pixelmask test and the break
// statements are not visible in this listing - confirm masked pixels are skipped in each loop.
1858 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1861 int startx = span->startx;
1862 int endx = span->endx;
1865 unsigned char * RESTRICT pixelmask = span->pixelmask;
1866 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// move to the span origin; 4 bytes per pixel
1869 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1870 // handle alphatest now (this affects depth writes too)
// pixels whose alpha is below 0.5 are removed from the mask
1871 if (thread->alphatest)
1872 for (x = startx;x < endx;x++)
1873 if (in4f[x*4+3] < 0.5f)
1874 pixelmask[x] = false;
1875 // FIXME: this does not handle bigendian
1876 switch(thread->fb_blendmode)
// OPAQUE: result = src * 255
1878 case DPSOFTRAST_BLENDMODE_OPAQUE:
1879 for (x = startx;x < endx;x++)
1883 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1884 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1885 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1886 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1887 pixel[x*4+0] = d[0];
1888 pixel[x*4+1] = d[1];
1889 pixel[x*4+2] = d[2];
1890 pixel[x*4+3] = d[3];
// ALPHA: result = src * alpha * 255 + dst * (1 - alpha)
1893 case DPSOFTRAST_BLENDMODE_ALPHA:
1894 for (x = startx;x < endx;x++)
1898 a = in4f[x*4+3] * 255.0f;
1899 b = 1.0f - in4f[x*4+3];
1900 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1901 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1902 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1903 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1904 pixel[x*4+0] = d[0];
1905 pixel[x*4+1] = d[1];
1906 pixel[x*4+2] = d[2];
1907 pixel[x*4+3] = d[3];
// ADDALPHA: result = src * alpha * 255 + dst
1910 case DPSOFTRAST_BLENDMODE_ADDALPHA:
1911 for (x = startx;x < endx;x++)
1915 a = in4f[x*4+3] * 255.0f;
1916 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1917 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1918 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1919 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1920 pixel[x*4+0] = d[0];
1921 pixel[x*4+1] = d[1];
1922 pixel[x*4+2] = d[2];
1923 pixel[x*4+3] = d[3];
// ADD: result = src * 255 + dst
1926 case DPSOFTRAST_BLENDMODE_ADD:
1927 for (x = startx;x < endx;x++)
1931 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1932 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1933 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1934 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1935 pixel[x*4+0] = d[0];
1936 pixel[x*4+1] = d[1];
1937 pixel[x*4+2] = d[2];
1938 pixel[x*4+3] = d[3];
// INVMOD: result = (1 - src) * dst
1941 case DPSOFTRAST_BLENDMODE_INVMOD:
1942 for (x = startx;x < endx;x++)
1946 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1947 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1948 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1949 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1950 pixel[x*4+0] = d[0];
1951 pixel[x*4+1] = d[1];
1952 pixel[x*4+2] = d[2];
1953 pixel[x*4+3] = d[3];
// MUL: result = src * dst
1956 case DPSOFTRAST_BLENDMODE_MUL:
1957 for (x = startx;x < endx;x++)
1961 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1962 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1963 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1964 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1965 pixel[x*4+0] = d[0];
1966 pixel[x*4+1] = d[1];
1967 pixel[x*4+2] = d[2];
1968 pixel[x*4+3] = d[3];
// MUL2: result = src * dst * 2
1971 case DPSOFTRAST_BLENDMODE_MUL2:
1972 for (x = startx;x < endx;x++)
1976 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
1977 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
1978 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
1979 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
1980 pixel[x*4+0] = d[0];
1981 pixel[x*4+1] = d[1];
1982 pixel[x*4+2] = d[2];
1983 pixel[x*4+3] = d[3];
// SUBALPHA: result = dst - src * alpha * 255, clamped to 0..255
1986 case DPSOFTRAST_BLENDMODE_SUBALPHA:
1987 for (x = startx;x < endx;x++)
1991 a = in4f[x*4+3] * -255.0f;
1992 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
1993 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
1994 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
1995 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
// PSEUDOALPHA: result = src * a + dst * (1 - alpha); the assignment of a is on a line not shown here
2002 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2003 for (x = startx;x < endx;x++)
2008 b = 1.0f - in4f[x*4+3];
2009 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
// Byte-input counterpart of DPSOFTRAST_Draw_Span_Finish: writes a span of 4-byte colors into
// the color buffer dpsoftrast.fb_colorpixels[0], blending according to thread->fb_blendmode.
//   thread   - supplies alphatest and fb_blendmode
//   triangle - not referenced in the visible lines
//   span     - supplies startx/endx, the span origin x, y and the pixelmask
//   in4ub    - 4 bytes per pixel, indexed by x*4 (also viewed as one 32-bit word per pixel via ini)
// Blending is done on 16-bit lanes after widening the bytes; _mm_packus_epi16 saturates the
// result back to 0..255 on store.
// NOTE(review): several lines of the FINISHBLEND macro (case labels, braces, the blend2/blend1
// expansion points) and the FINISHBLEND(...) invocation lines of most modes are not visible
// in this listing.
2022 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2026 int startx = span->startx;
2027 int endx = span->endx;
2028 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2029 unsigned char * RESTRICT pixelmask = span->pixelmask;
// two views of the same framebuffer: per byte and per 32-bit pixel
2030 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2031 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to the span origin
2034 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2035 pixeli += span->y * dpsoftrast.fb_width + span->x;
2036 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" is true only for alpha == 0, whereas the
// float path tests alpha < 0.5 on a 0..1 scale - confirm whether a mid-range threshold (e.g. 128) was intended.
2037 if (thread->alphatest)
2038 for (x = startx;x < endx;x++)
2039 if (in4ub[x*4+3] < 0.5f)
2040 pixelmask[x] = false;
2041 // FIXME: this does not handle bigendian
2042 switch(thread->fb_blendmode)
// OPAQUE: four pixels at a time; a fully set 4-byte mask (0x01010101) allows one 16-byte copy
2044 case DPSOFTRAST_BLENDMODE_OPAQUE:
2045 for (x = startx;x + 4 <= endx;)
2047 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2049 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2063 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): shared loop for all blended modes. Pixels are handled in pairs,
// dispatching on the two mask bytes read as one 16-bit value; blend2 is the statement for a
// two-pixel vector, blend1 for a single pixel. A scalar tail loop handles a leftover pixel.
2064 #define FINISHBLEND(blend2, blend1) \
2065 for (x = startx;x + 2 <= endx;x += 2) \
2068 switch (*(const unsigned short*)&pixelmask[x]) \
2071 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2072 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2074 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2077 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2078 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2080 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2083 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2084 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2086 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2091 for(;x < endx; x++) \
2094 if (!pixelmask[x]) \
2096 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2097 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2099 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
// ALPHA: dst += (src - dst) * alpha / 256; blend broadcasts each pixel's alpha lane
// (both operands are shifted left by 4 so the high half of the 16x16 product is the /256 result)
2103 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2104 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2106 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2107 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2110 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2112 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2113 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2115 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2116 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst
2119 case DPSOFTRAST_BLENDMODE_ADD:
2120 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2122 case DPSOFTRAST_BLENDMODE_INVMOD:
2124 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2126 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2129 case DPSOFTRAST_BLENDMODE_MUL:
2130 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2132 case DPSOFTRAST_BLENDMODE_MUL2:
2133 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2135 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2137 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2138 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2140 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2141 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2144 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2146 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2147 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2149 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2150 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// Samples the 2D texture bound to texunitindex along one span and writes four
// floats per pixel to out4f[x*4 .. x*4+3] for every x in [startx, endx).
//   thread       - supplies the bound texture via thread->texbound[texunitindex]
//   triangle     - supplies the mip level (triangle->mip[texunitindex]) and the
//                  attribute planes consumed by DPSOFTRAST_CALCATTRIB4F
//   span         - supplies startx/endx
//   out4f        - destination, indexed by span-relative x
//   arrayindex   - which vertex attribute holds the texture coordinates
//   zf           - per-pixel factor the interpolated coordinates are multiplied by
//                  (presumably the perspective-correction term - confirm against caller)
// Texel bytes 2,1,0,3 are written to output channels 0,1,2,3.
// NOTE(review): several declarations and all brace lines of this function are not
// visible in this view; only the lines shown here were touched.
2157 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2160 int startx = span->startx;
2161 int endx = span->endx;
2166 float tc[2], endtc[2];
2168 unsigned int tci[2];
2169 unsigned int tci1[2];
2170 unsigned int tcimin[2];
2171 unsigned int tcimax[2];
2176 const unsigned char * RESTRICT pixelbase;
2177 const unsigned char * RESTRICT pixel[4];
2178 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2179 // if no texture is bound, just fill the span with opaque white
2182 for (x = startx;x < endx;x++)
2184 out4f[x*4+0] = 1.0f;
2185 out4f[x*4+1] = 1.0f;
2186 out4f[x*4+2] = 1.0f;
2187 out4f[x*4+3] = 1.0f;
2191 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is the byte offset of this mip level inside texture->bytes
2192 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2193 // if this mipmap of the texture is 1 pixel (4 bytes), just fill the span with that color
2194 if (texture->mipmap[mip][1] == 4)
// FIX: read the texel of the selected mip level (pixelbase), not the first
// texel of the whole texture (texture->bytes); this matches the BGRA8 variant.
2196 c[0] = pixelbase[2] * (1.0f/255.0f);
2197 c[1] = pixelbase[1] * (1.0f/255.0f);
2198 c[2] = pixelbase[0] * (1.0f/255.0f);
2199 c[3] = pixelbase[3] * (1.0f/255.0f);
2200 for (x = startx;x < endx;x++)
2202 out4f[x*4+0] = c[0];
2203 out4f[x*4+1] = c[1];
2204 out4f[x*4+2] = c[2];
2205 out4f[x*4+3] = c[3];
2209 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2210 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2211 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the width and height of this mip level
2212 tcscale[0] = texture->mipmap[mip][2];
2213 tcscale[1] = texture->mipmap[mip][3];
2214 tciwidth = texture->mipmap[mip][2];
// NOTE(review): tcimin is read below but its assignment is not visible in this view
2217 tcimax[0] = texture->mipmap[mip][2]-1;
2218 tcimax[1] = texture->mipmap[mip][3]-1;
// wrap masks are size-1, which only wraps correctly for power-of-two sizes
2219 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2220 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 shifts to texel centers
2221 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2222 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// The span is walked in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels;
// exact coordinates are computed at the sub-span ends and stepped linearly in
// 16.16 fixed point in between.
2223 for (x = startx;x < endx;)
2225 unsigned int subtc[2];
2226 unsigned int substep[2];
2227 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2228 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (short) sub-span: end on the final pixel of the span
2231 nextsub = endsub = endx-1;
2232 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2236 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2237 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): tc is presumably the previous endtc; that assignment is not visible here
2238 substep[0] = (endtc[0] - tc[0]) * subscale;
2239 substep[1] = (endtc[1] - tc[1]) * subscale;
2240 subtc[0] = tc[0] * (1<<16);
2241 subtc[1] = tc[1] * (1<<16);
// bilinear filtering, clamp-to-edge addressing
2244 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2246 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// FIX: the blend weight is the TOP 12 bits of the 16-bit fraction; the
// previous "& 0xFFF" took the low 12 bits, which cycle 16 times per texel.
2248 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2249 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four corner weights sum to 0x1000*0x1000
2250 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2251 tci[0] = subtc[0]>>16;
2252 tci[1] = subtc[1]>>16;
2253 tci1[0] = tci[0] + 1;
2254 tci1[1] = tci[1] + 1;
2255 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2256 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2257 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2258 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2259 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2260 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2261 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2262 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 == 255 * 0x1000 * 0x1000, so the weighted byte sum maps to 0..1
2263 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2264 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2265 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2266 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2267 out4f[x*4+0] = c[0];
2268 out4f[x*4+1] = c[1];
2269 out4f[x*4+2] = c[2];
2270 out4f[x*4+3] = c[3];
// bilinear filtering, wrapping (repeat) addressing
2275 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// FIX: same fraction extraction as in the clamp-to-edge loop above
2277 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2278 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2279 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2280 tci[0] = subtc[0]>>16;
2281 tci[1] = subtc[1]>>16;
2282 tci1[0] = tci[0] + 1;
2283 tci1[1] = tci[1] + 1;
2284 tci[0] &= tciwrapmask[0];
2285 tci[1] &= tciwrapmask[1];
2286 tci1[0] &= tciwrapmask[0];
2287 tci1[1] &= tciwrapmask[1];
2288 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2289 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2290 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2291 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2292 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2293 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2294 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2295 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2296 out4f[x*4+0] = c[0];
2297 out4f[x*4+1] = c[1];
2298 out4f[x*4+2] = c[2];
2299 out4f[x*4+3] = c[3];
// nearest-texel sampling, clamp-to-edge addressing
2303 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2305 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2307 tci[0] = subtc[0]>>16;
2308 tci[1] = subtc[1]>>16;
2309 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2310 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2311 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2312 c[0] = pixel[0][2] * (1.0f / 255.0f);
2313 c[1] = pixel[0][1] * (1.0f / 255.0f);
2314 c[2] = pixel[0][0] * (1.0f / 255.0f);
2315 c[3] = pixel[0][3] * (1.0f / 255.0f);
2316 out4f[x*4+0] = c[0];
2317 out4f[x*4+1] = c[1];
2318 out4f[x*4+2] = c[2];
2319 out4f[x*4+3] = c[3];
// nearest-texel sampling, wrapping (repeat) addressing
2324 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2326 tci[0] = subtc[0]>>16;
2327 tci[1] = subtc[1]>>16;
2328 tci[0] &= tciwrapmask[0];
2329 tci[1] &= tciwrapmask[1];
2330 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2331 c[0] = pixel[0][2] * (1.0f / 255.0f);
2332 c[1] = pixel[0][1] * (1.0f / 255.0f);
2333 c[2] = pixel[0][0] * (1.0f / 255.0f);
2334 c[3] = pixel[0][3] * (1.0f / 255.0f);
2335 out4f[x*4+0] = c[0];
2336 out4f[x*4+1] = c[1];
2337 out4f[x*4+2] = c[2];
2338 out4f[x*4+3] = c[3];
// SSE2 byte-buffer variant of the 2D texture span sampler: writes one 4-byte
// texel-format pixel per x in [startx, endx) to out4ub (viewed as outi[x]).
//   thread       - supplies the bound texture via thread->texbound[texunitindex]
//   triangle     - supplies the mip level and the attribute planes used by
//                  DPSOFTRAST_CALCATTRIB
//   span         - supplies startx/endx
//   arrayindex   - which vertex attribute holds the texture coordinates
//   zf           - per-pixel factor applied to the interpolated coordinates
//                  (presumably the perspective-correction term - confirm against caller)
// Coordinates are kept as 16.16 fixed point in subtc, two pixels at a time:
// lanes 0/1 hold (x, y) of pixel x and lanes 2/3 hold (x, y) of pixel x+1.
// NOTE(review): several declarations, guards and all brace lines of this function
// are not visible in this view; only the lines shown here were touched.
2344 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2348 int startx = span->startx;
2349 int endx = span->endx;
2351 __m128 data, slope, tcscale;
2352 __m128i tcsize, tcmask, tcoffset, tcmax;
2354 __m128i subtc, substep, endsubtc;
2357 unsigned int *outi = (unsigned int *)out4ub;
2358 const unsigned char * RESTRICT pixelbase;
2359 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2360 // if no texture is bound, just fill the span with opaque white
// FIX: fill exactly the pixels [startx, endx) that every other path writes; the
// previous length (span->length*4, started at startx*4) could run past them.
2363 memset(out4ub + startx*4, 255, (endx - startx)*4);
2366 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is the byte offset of this mip level inside texture->bytes
2367 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2368 // if this mipmap of the texture is 1 pixel (4 bytes), just fill the span with that color
2369 if (texture->mipmap[mip][1] == 4)
2371 unsigned int k = *((const unsigned int *)pixelbase);
2372 for (x = startx;x < endx;x++)
2376 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2377 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2378 flags = texture->flags;
// tcsize lanes = (mipmap[mip][2], mipmap[mip][3], mipmap[mip][2], mipmap[mip][3]), i.e. (width, height) twice
2379 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2380 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2381 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the (s, t) pair into both halves and pre-scale to texel units
2382 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2383 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at the first pixel; the -0.5 shifts to texel centers
2384 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2385 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane: low word 4, high word width*4, so _mm_madd_epi16 on an
// (x, y) word pair yields the byte offset 4*x + 4*width*y
2386 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// filtered paths compare packed 16-bit (x, y) words; the nearest paths compare
// the integer (upper) word of each 16.16 lane
2387 tcmax = filter ? _mm_packs_epi32(tcmask, tcmask) : _mm_slli_epi32(tcmask, 16);
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2388 for (x = startx;x < endx;)
2390 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2391 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (short) sub-span: end on the final pixel of the span
2394 nextsub = endsub = endx-1;
2395 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2399 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2400 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2401 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinate of pixel x, high half = coordinate of pixel x+1;
// the step is doubled because each iteration advances two pixels
2402 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2403 substep = _mm_slli_epi32(substep, 1);
// Bilinear fast path: when both ends of the sub-span lie in [0, size-1) no
// clamping or wrapping is needed and two horizontally adjacent texels can be
// fetched with a single 64-bit load.
2406 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2407 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2409 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// gather the integer (x, y) words of both pixels, then add 1 to y in every
// second pair so the lanes address rows y and y+1
2411 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2412 tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
2413 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2414 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2415 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
2416 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
// 15-bit fractions, so that (diff<<1) * frac >> 16 equals diff * fraction
2417 fracm = _mm_srli_epi16(subtc, 1);
// vertical blend for pixel x (rows y and y+1), using the y fraction
2418 pix1 = _mm_add_epi16(pix1,
2419 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2420 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
// vertical blend for pixel x+1
2421 pix3 = _mm_add_epi16(pix3,
2422 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2423 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// horizontal blend of the left/right columns of both pixels, using the x fractions
2424 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2425 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2426 pix2 = _mm_add_epi16(pix2,
2427 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2428 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2429 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the fast-path body (its guard is not visible in this view)
2433 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2434 tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
2435 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2436 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2437 fracm = _mm_srli_epi16(subtc, 1);
2438 pix1 = _mm_add_epi16(pix1,
2439 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2440 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2441 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2442 pix1 = _mm_add_epi16(pix1,
2443 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2444 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2445 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Bilinear, clamp-to-edge: the four corner texels (x,y) (x+1,y) (x,y+1)
// (x+1,y+1) are clamped to [0, size-1] individually and fetched with 32-bit loads.
2449 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2451 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// corners of pixel x
2453 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2454 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2455 tci = _mm_madd_epi16(tci, tcoffset);
2456 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2457 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2458 _mm_setzero_si128());
2459 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2460 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2461 _mm_setzero_si128());
// corners of pixel x+1
2462 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// FIX: clamp like the first pixel and the single-pixel tail do; this line
// previously wrapped with _mm_and_si128, which is the repeat-mode addressing.
2463 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2464 tci = _mm_madd_epi16(tci, tcoffset);
2465 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2466 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2467 _mm_setzero_si128());
2468 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2469 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2470 _mm_setzero_si128());
2471 fracm = _mm_srli_epi16(subtc, 1);
2472 pix1 = _mm_add_epi16(pix1,
2473 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2474 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2475 pix3 = _mm_add_epi16(pix3,
2476 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2477 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2478 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2479 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2480 pix2 = _mm_add_epi16(pix2,
2481 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2482 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2483 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the clamp-to-edge body (its guard is not visible in this view)
2487 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2488 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2489 tci = _mm_madd_epi16(tci, tcoffset);
2490 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2491 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2492 _mm_setzero_si128());
2493 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2494 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2495 _mm_setzero_si128());
2496 fracm = _mm_srli_epi16(subtc, 1);
2497 pix1 = _mm_add_epi16(pix1,
2498 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2499 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2500 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2501 pix1 = _mm_add_epi16(pix1,
2502 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2503 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2504 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Bilinear, wrapping (repeat): corner coordinates are masked with size-1
2510 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2512 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2513 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2514 tci = _mm_madd_epi16(tci, tcoffset);
2515 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2516 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2517 _mm_setzero_si128());
2518 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2519 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2520 _mm_setzero_si128());
2521 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2522 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2523 tci = _mm_madd_epi16(tci, tcoffset);
2524 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2525 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2526 _mm_setzero_si128());
2527 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2528 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2529 _mm_setzero_si128());
2530 fracm = _mm_srli_epi16(subtc, 1);
2531 pix1 = _mm_add_epi16(pix1,
2532 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2533 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2534 pix3 = _mm_add_epi16(pix3,
2535 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2536 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2537 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2538 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2539 pix2 = _mm_add_epi16(pix2,
2540 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2541 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2542 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the wrapping body (its guard is not visible in this view)
2546 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2547 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2548 tci = _mm_madd_epi16(tci, tcoffset);
2549 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2550 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2551 _mm_setzero_si128());
2552 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2553 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2554 _mm_setzero_si128());
2555 fracm = _mm_srli_epi16(subtc, 1);
2556 pix1 = _mm_add_epi16(pix1,
2557 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2558 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2559 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2560 pix1 = _mm_add_epi16(pix1,
2561 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2563 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Nearest-texel sampling, clamp-to-edge: clamp the integer words against tcmax,
// then copy the addressed texel unchanged.
2570 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2572 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2574 __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
2575 tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2576 tci = _mm_madd_epi16(tci, tcoffset);
2577 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2578 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
// single-pixel form (its guard is not visible in this view)
2582 __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
2583 tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2584 tci = _mm_madd_epi16(tci, tcoffset);
2585 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Nearest-texel sampling, wrapping (repeat): mask the integer words with tcmax
2591 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2593 __m128i tci = _mm_and_si128(subtc, tcmax);
2594 tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2595 tci = _mm_madd_epi16(tci, tcoffset);
2596 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2597 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
// single-pixel form (its guard is not visible in this view)
2601 __m128i tci = _mm_and_si128(subtc, tcmax);
2602 tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2603 tci = _mm_madd_epi16(tci, tcoffset);
2604 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map span sampler placeholder: the only visible statement fills
// span->length pixels (4 bytes each) of out4ub with 255, starting at out4ub[0].
// triangle, texunitindex, arrayindex and zf are not used by the visible code.
// NOTE(review): unlike the 2D sampler this fill is not offset by span->startx - confirm intended.
2613 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2616 memset(out4ub, 255, span->length*4);
// Shadow-map lookup for the given vector, returning a float.
// NOTE(review): the body of this function is not visible in this view, so the
// meaning of `vector` and of the return value must be confirmed from the full file.
2619 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies every pixel of in4f by the interpolated 4-component vertex
// attribute `arrayindex` and stores the product in out4f, for x in [startx, endx).
//   triangle, span - feed DPSOFTRAST_CALCATTRIB4F, which fills data[] and slope[]
//   out4f, in4f    - four floats per pixel, indexed by span-relative x
//   zf             - per-pixel factor (see note on z below)
2625 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2628 int startx = span->startx;
2629 int endx = span->endx;
2634 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2635 for (x = startx;x < endx;x++)
// attribute value at x is (data + slope*x) * z
// NOTE(review): the assignment of z is not visible here; presumably z = zf[x] - confirm
2638 c[0] = (data[0] + slope[0]*x) * z;
2639 c[1] = (data[1] + slope[1]*x) * z;
2640 c[2] = (data[2] + slope[2]*x) * z;
2641 c[3] = (data[3] + slope[3]*x) * z;
2642 out4f[x*4+0] = in4f[x*4+0] * c[0];
2643 out4f[x*4+1] = in4f[x*4+1] * c[1];
2644 out4f[x*4+2] = in4f[x*4+2] * c[2];
2645 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated 4-component vertex attribute `arrayindex` to out4f
// (four floats per pixel) for x in [startx, endx).
//   triangle, span - feed DPSOFTRAST_CALCATTRIB4F, which fills data[] and slope[]
//   zf             - per-pixel factor (see note on z below)
2649 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2652 int startx = span->startx;
2653 int endx = span->endx;
2658 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2659 for (x = startx;x < endx;x++)
// attribute value at x is (data + slope*x) * z
// NOTE(review): the assignment of z is not visible here; presumably z = zf[x] - confirm
2662 c[0] = (data[0] + slope[0]*x) * z;
2663 c[1] = (data[1] + slope[1]*x) * z;
2664 c[2] = (data[2] + slope[2]*x) * z;
2665 c[3] = (data[3] + slope[3]*x) * z;
2666 out4f[x*4+0] = c[0];
2667 out4f[x*4+1] = c[1];
2668 out4f[x*4+2] = c[2];
2669 out4f[x*4+3] = c[3];
// Bloom combine on float buffers: out = ina + max(inb - subcolor, 0), per
// component, for x in [startx, endx).
//   span     - supplies startx/endx (triangle is not used by the visible code)
//   out4f, ina4f, inb4f - four floats per pixel, indexed by span-relative x
//   subcolor - four floats subtracted from inb before it is added
2673 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2675 int x, startx = span->startx, endx = span->endx;
2676 float c[4], localcolor[4];
// copy subcolor into locals before the loop
2677 localcolor[0] = subcolor[0];
2678 localcolor[1] = subcolor[1];
2679 localcolor[2] = subcolor[2];
2680 localcolor[3] = subcolor[3];
2681 for (x = startx;x < endx;x++)
// the difference is clamped at zero so inb can only brighten ina
2683 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2684 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2685 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2686 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2687 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2688 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2689 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2690 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float buffers: out = ina * inb for every
// pixel x in [startx, endx) (four floats per pixel).
// Only span->startx/endx are read; triangle is not used by the visible code.
2694 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2696 int x, startx = span->startx, endx = span->endx;
2697 for (x = startx;x < endx;x++)
2699 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2700 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2701 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2702 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float buffers: out = ina + inb for every
// pixel x in [startx, endx) (four floats per pixel). No clamping is applied.
// Only span->startx/endx are read; triangle is not used by the visible code.
2706 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2708 int x, startx = span->startx, endx = span->endx;
2709 for (x = startx;x < endx;x++)
2711 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2712 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2713 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2714 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends inb over ina per pixel: out = ina * a + inb * b for every pixel x in
// [startx, endx), where a = 1 - inb's fourth component.
// Only span->startx/endx are read; triangle is not used by the visible code.
2718 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2720 int x, startx = span->startx, endx = span->endx;
2722 for (x = startx;x < endx;x++)
2724 a = 1.0f - inb4f[x*4+3];
// NOTE(review): the assignment of b is not visible here; presumably b = inb4f[x*4+3] - confirm
2726 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2727 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2728 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2729 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a constant color over in4f using the color's own fourth component as
// the weight: out = in * (1 - color[3]) + color * color[3], for every pixel x
// in [startx, endx) (four floats per pixel).
// Only span->startx/endx are read; triangle is not used by the visible code.
2733 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2735 int x, startx = span->startx, endx = span->endx;
2736 float localcolor[4], ilerp, lerp;
// copy color into locals before the loop
2737 localcolor[0] = color[0];
2738 localcolor[1] = color[1];
2739 localcolor[2] = color[2];
2740 localcolor[3] = color[3];
// blend weights are constant across the span
2741 ilerp = 1.0f - localcolor[3];
2742 lerp = localcolor[3];
2743 for (x = startx;x < endx;x++)
2745 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2746 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2747 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2748 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2 byte-buffer variant of MultiplyVarying: multiplies each 4-byte pixel of
// in4ub by the interpolated vertex attribute `arrayindex` (times zf[x]) and
// stores the result in out4ub, for x in [startx, endx).
// Pixels are processed two at a time, with a one-pixel loop for the remainder.
2754 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2758 int startx = span->startx;
2759 int endx = span->endx;
2761 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the byte order of the pixels
2762 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2763 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// advance to the first pixel, then scale by 256 so the modulator is 8.8 fixed point
2764 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2765 data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2766 slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
// data is stepped once inside the body and once in the loop header: two pixels per pass
2767 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
// unpacking with zero as the LOW byte places each pixel byte in the high half
// of a 16-bit word (byte*256)
2769 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2770 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2771 data = _mm_add_ps(data, slope);
2772 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2773 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
// high 16 bits of (byte*256) * (attr*256) == byte * attr
2774 pix = _mm_mulhi_epu16(pix, mod);
2775 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// remaining odd pixel, same arithmetic on a single 32-bit pixel
2777 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2779 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2780 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2781 mod = _mm_packs_epi32(mod, mod);
2782 pix = _mm_mulhi_epu16(pix, mod);
2783 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 byte-buffer variant of Varying: writes the interpolated vertex attribute
// `arrayindex` (times zf[x]) as one 4-byte pixel per x in [startx, endx).
// Pixels are processed two at a time, with a one-pixel loop for the remainder.
2788 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2792 int startx = span->startx;
2793 int endx = span->endx;
2795 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the byte order of the pixels
2796 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2797 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// advance to the first pixel, then scale by 255 so a value of 1.0 maps to byte 255
2798 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2799 data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2800 slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
// data is stepped once inside the body and once in the loop header: two pixels per pass
2801 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2803 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2804 data = _mm_add_ps(data, slope);
2805 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
// narrow 32 -> 16 bit with signed saturation, then 16 -> 8 bit with unsigned
// saturation, so out-of-range values end up clamped to 0..255
2806 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2807 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// remaining odd pixel
2809 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2811 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2812 pix = _mm_packs_epi32(pix, pix);
2813 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 byte-buffer variant of AddBloom: out = ina + max(inb - subcolor, 0) per
// byte component, saturated to 0..255, for x in [startx, endx).
//   span     - supplies startx/endx (triangle is not used by the visible code)
//   out4ub, ina4ub, inb4ub - 4 bytes per pixel, indexed by span-relative x
//   subcolor - four floats; scaled by 255 and subtracted from inb
// Pixels are processed two at a time, followed by a single-pixel tail.
// NOTE(review): brace lines and the guard of the tail are not visible in this
// view; only the lines shown here were touched.
2818 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2821 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to byte range with components 0 and 2 swapped to match the
// pixel byte order, then packed to 16-bit words repeated in both 64-bit halves
2822 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2823 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2824 for (x = startx;x+2 <= endx;x+=2)
2826 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2827 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// FIX: clamp (inb - subcolor) at zero before adding, exactly as the float
// version does; without the clamp a dark inb texel darkened ina.
2828 pix1 = _mm_add_epi16(pix1, _mm_max_epi16(_mm_sub_epi16(pix2, localcolor), _mm_setzero_si128()));
2829 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail
2833 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2834 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
// FIX: same zero clamp as in the two-pixel loop
2835 pix1 = _mm_add_epi16(pix1, _mm_max_epi16(_mm_sub_epi16(pix2, localcolor), _mm_setzero_si128()));
2836 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel modulate of two BGRA8 spans, for x in [startx, endx):
//   out = (ina * inb) >> 8.
// Because the product is divided by 256 rather than 255, 255*255 yields 254.
2841 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2844 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
2845 for (x = startx;x+2 <= endx;x+=2)
// ina is widened into the low byte of each 16-bit lane
2847 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb is placed in the high byte of each lane (value << 8) by swapping the unpack operands
2848 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b << 8) is (a * b) >> 8
2849 pix1 = _mm_mulhi_epu16(pix1, pix2);
2850 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same arithmetic
2854 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2855 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2856 pix1 = _mm_mulhi_epu16(pix1, pix2);
2857 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel saturating add of two BGRA8 spans, for x in [startx, endx):
//   out = min(255, ina + inb).
2862 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2865 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
2866 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum cannot wrap
2868 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2869 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2870 pix1 = _mm_add_epi16(pix1, pix2);
// the unsigned-saturating pack performs the clamp to 255
2871 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same arithmetic
2875 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2876 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2877 pix1 = _mm_add_epi16(pix1, pix2);
2878 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb to ina, per BGRA8 channel, for x in [startx, endx):
//   out = saturate(ina + ((tint * inb) >> 8)), with tint = inbtintbgra * 256.
// Unlike the other uniform-colour helpers here, the tint is not channel-swizzled;
// the parameter name suggests the caller already supplies it in BGRA order.
2883 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2886 int x, startx = span->startx, endx = span->endx;
// tint as 8.8 fixed point (1.0 -> 256)
2887 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes and replicate for two pixels
2888 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
// main loop: two pixels per iteration
2889 for (x = startx;x+2 <= endx;x+=2)
2891 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb goes into the high byte of each lane (value << 8)
2892 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (inb << 8) is (tint * inb) >> 8
2893 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2894 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same arithmetic
2898 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2899 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2900 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2901 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha-blends inb over ina using inb's own alpha byte, for x in [startx, endx):
//   out = ina + (((inb - ina) * inb.alpha) >> 8) per channel (alpha channel included).
2906 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2909 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
2910 for (x = startx;x+2 <= endx;x+=2)
2912 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2913 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast word 3 (alpha) of each pixel across that pixel's four lanes
2914 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply returns (diff * alpha) >> 8
2915 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2916 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form: only the low four lanes carry data, so only the low-half shuffle is needed
2920 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2921 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2922 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2923 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2924 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a uniform colour over a BGRA8 span using the colour's own alpha, for x in [startx, endx):
//   out = in + (((color*255 - in) * (color.alpha*255)) >> 8) per channel.
// color points at four floats; its components 0 and 2 are swapped to match BGRA order.
2929 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2932 int x, startx = span->startx, endx = span->endx;
// scale to 0..255 integers and swizzle to BGRA
2933 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes and replicate for two pixels
2934 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// broadcast the alpha word to every lane, pre-shifted by 4 for the high-half multiply below
2935 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// main loop: two pixels per iteration
2936 for (x = startx;x+2 <= endx;x+=2)
2938 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// ((diff << 4) * (alpha << 4)) >> 16 equals (diff * alpha) >> 8
2939 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2940 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel form of the same arithmetic
2944 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2945 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2946 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2953 void DPSOFTRAST_VertexShader_Generic(void)
2955 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2956 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
2957 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
2958 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2959 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
2962 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
2964 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2965 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2966 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2967 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2968 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
2969 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
2971 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
2972 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
2973 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
2975 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
2976 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2979 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2981 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2984 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2986 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
2989 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2994 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
2995 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3000 void DPSOFTRAST_VertexShader_PostProcess(void)
3002 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3003 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3004 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3007 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3009 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3010 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3011 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3012 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3013 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3014 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3015 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3017 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3018 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3020 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3021 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3023 // TODO: implement saturation
3025 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3027 // TODO: implement gammaramps
3029 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth-only / shadow mode: positions are transformed and
// projected with the model-view-projection matrix; no other vertex array is written.
3034 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3036 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3039 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3041 // this is never called (because colormask is off when this shader is used)
3042 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3043 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3044 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3045 memset(buffer_FragColorbgra8, 0, span->length*4);
3046 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3051 void DPSOFTRAST_VertexShader_FlatColor(void)
3053 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3054 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3057 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3059 int x, startx = span->startx, endx = span->endx;
3060 int Color_Ambienti[4];
3061 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3062 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3063 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3064 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3065 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3066 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3067 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3068 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3069 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3070 for (x = startx;x < endx;x++)
3072 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3073 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3074 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3075 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3077 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3082 void DPSOFTRAST_VertexShader_VertexColor(void)
3084 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3085 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3086 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour mode (SSE2):
//   out = texture * (Color_Ambient + Color_Diffuse * interpolated vertex colour),
// with the alpha factor taken from the Alpha uniform instead of Color_Ambient.
// Writes straight into the framebuffer row unless alpha test or a non-opaque
// blend mode is active; in that case it renders into buffer_FragColorbgra8 and
// hands that to DPSOFTRAST_Draw_Span_FinishBGRA8.
3089 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3092 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer at the span origin (span->x, span->y), four bytes per pixel
3093 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3094 int x, startx = span->startx, endx = span->endx;
3095 __m128i Color_Ambientm, Color_Diffusem;
3097 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3098 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3099 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3100 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3101 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3102 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finisher, so redirect output to the local buffer
3103 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3104 pixel = buffer_FragColorbgra8;
// ambient as 8.8 fixed point, swizzled to BGRA; the _mm_load_ps requires the uniform to be 16-byte aligned
3105 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the alpha lane, then replace it with Alpha scaled by 255
3106 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3107 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3108 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse is scaled by 4096 (as is the interpolated colour below) so the high-half multiply of the two returns 8.8 fixed point; its alpha lane is zeroed
3109 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3110 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3111 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// attribute value and per-pixel slope for the colour array, swizzled to BGRA, advanced to startx and scaled by 4096
3112 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3113 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3114 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3115 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3116 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3117 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3118 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3120 __m128i color, mod, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes are 1
3121 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3124 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3125 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// per-pixel colour = interpolated attribute * that pixel's buffer_z value; data is stepped once per pixel
3126 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3127 data = _mm_add_ps(data, slope);
3128 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3129 data = _mm_add_ps(data, slope);
3130 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3131 data = _mm_add_ps(data, slope);
3132 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse*colour + ambient) in 8.8, times the texel placed in the high byte of each lane
3133 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3134 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3135 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3136 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3137 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one texel
3143 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3144 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3145 mod = _mm_packs_epi32(mod, mod);
3146 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3147 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the redirected case still needs the finisher
3149 if(pixel == buffer_FragColorbgra8)
3150 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3156 void DPSOFTRAST_VertexShader_Lightmap(void)
3158 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3159 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3160 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
// Pixel stage of the lightmap mode (SSE2):
//   out = colortex * (Color_Ambient + Color_Diffuse * lightmaptex)
// plus, with SHADERPERMUTATION_GLOW, Color_Glow * glowtex.
// The alpha factor comes from the Alpha uniform. Output goes straight to the
// framebuffer row unless alpha test or a non-opaque blend mode is active, in
// which case it is rendered to buffer_FragColorbgra8 and finished separately.
3163 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3166 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer at the span origin, four bytes per pixel
3167 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3168 int x, startx = span->startx, endx = span->endx;
3169 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3170 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3171 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3172 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3173 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3174 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3175 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap uses TEXCOORD4
3176 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3177 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finisher, so redirect output to the local buffer
3178 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3179 pixel = buffer_FragColorbgra8;
// ambient as 8.8 fixed point in BGRA order, with the alpha lane replaced by Alpha*255
3180 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3181 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3182 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3183 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse as 8.8 fixed point in BGRA order, alpha lane zeroed
3184 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3185 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3186 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// glow variant
3187 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3189 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3190 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3191 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3192 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// ambient in the low four lanes and glow in the high four, for the single-pixel path below
3193 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3194 for (x = startx;x < endx;x++)
3196 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes are 1
3197 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3200 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3201 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3202 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// texels are unpacked into the high byte of each lane so the high-half multiplies act as 8.8 fixed-point products
3203 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3204 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3205 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3206 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3207 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3208 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3209 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit-colour term is computed in the low half and the glow term in the high half of one register, then the halves are added
3215 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3216 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3217 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3218 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3219 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3220 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// second loop: the same shading without the glow term
3225 for (x = startx;x < endx;x++)
3227 __m128i color, lightmap, pix;
// fast path: four fully-masked pixels at once
3228 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3231 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3232 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3233 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3234 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3235 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3236 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3237 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3243 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3244 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3245 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3246 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the redirected case still needs the finisher
3249 if(pixel == buffer_FragColorbgra8)
3250 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light mode: only positions are transformed and
// projected; no other vertex array is written.
3256 void DPSOFTRAST_VertexShader_FakeLight(void)
3258 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light mode. As written it performs no lighting: it
// emits an all-zero fragment colour for the first span->length pixels and
// passes it to the span finisher.
// NOTE(review): looks like a placeholder implementation - confirm.
3261 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3264 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3265 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3266 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes (BGRA) per pixel
3267 memset(buffer_FragColorbgra8, 0, span->length*4);
3268 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the model-space light-direction-map mode: forwards to the
// lightmap vertex shader without any additional work.
3273 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3275 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage of the model-space light-direction-map mode: forwards to the
// lightmap pixel shader with the same arguments.
3278 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3280 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the tangent-space light-direction-map mode: forwards to the
// lightmap vertex shader without any additional work.
3286 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3288 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage of the tangent-space light-direction-map mode: forwards to the
// lightmap pixel shader with the same arguments.
3291 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3293 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the light-direction mode.
// Besides projecting positions and transforming TEXCOORD0 by the texture matrix,
// it computes two per-vertex vectors by projecting onto the three basis vectors
// read from input TEXCOORD1, TEXCOORD2 and TEXCOORD3 (svector, tvector, normal):
//   post TEXCOORD1 = LightDir uniform in that basis
//   post TEXCOORD2 = (EyePosition uniform - vertex position) in that basis
// The fourth component of both outputs is written as 0.
3299 void DPSOFTRAST_VertexShader_LightDirection(void)
3302 int numvertices = dpsoftrast.numvertices;
3304 float LightVector[4];
3305 float EyePosition[4];
3306 float EyeVectorModelSpace[4];
// snapshot the two uniforms once, outside the per-vertex loop
3312 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3313 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3314 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3315 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3316 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3317 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3318 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3319 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3320 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3321 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3322 for (i = 0;i < numvertices;i++)
// gather this vertex's position and its three basis vectors (four floats per vertex, first three used)
3324 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3325 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3326 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3327 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3328 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3329 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3330 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3331 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3332 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3333 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3334 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3335 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light direction expressed in the (svector, tvector, normal) basis: one dot product per axis
3336 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3337 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3338 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3339 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3340 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3341 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3342 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex towards the eye, then expressed in the same basis (not normalized here)
3343 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3344 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3345 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3346 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3347 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3348 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3349 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3350 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3351 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3352 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Small scalar/vector helper macros.
// Min/Max evaluate the selected argument twice, so do not pass expressions with side effects.
3356 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3357 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of two indexable operands
3358 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3359 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// uses sqrt(), so the result has type double even for float inputs
3360 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// multi-line macro; from the visible part it computes the 3-component length of v
// NOTE(review): the rest of the body is not visible here - presumably scales v to unit length; confirm
3361 #define DPSOFTRAST_Vector3Normalize(v)\
3364 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for one span using a single light direction.
// Samples the color texture (plus pants/shirt, glow, normal and gloss textures depending on
// thread->shader_permutation), shades every x in [span->startx, span->endx) into
// buffer_FragColorbgra8 and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// One of three shading paths is taken: SPECULAR, else DIFFUSE, else ambient only.
// NOTE(review): z, glosstex, eyenormal, diffuse, specular and d are used below but their
// declarations (and the braces/else lines) are not visible in this listing.
3375 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, all indexed by x (4 bytes per pixel for the bgra8 ones).
3377 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3378 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3383 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3384 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385 int x, startx = span->startx, endx = span->endx;
3386 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3387 float LightVectordata[4];
3388 float LightVectorslope[4];
3389 float EyeVectordata[4];
3390 float EyeVectorslope[4];
3392 float diffusetex[4];
3394 float surfacenormal[4];
3395 float lightnormal[4];
3397 float specularnormal[4];
3400 float SpecularPower;
// Color uniforms are stored with components 0 and 2 swapped so that index 0..2 lines up with
// the byte order of the *bgra8 buffers they are multiplied with below.
3402 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3403 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3404 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3405 Color_Glow[3] = 0.0f;
3406 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3407 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3408 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient alpha slot carries the separate Alpha uniform; it scales the output alpha in every path.
3409 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3410 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3411 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3412 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3413 Color_Pants[3] = 0.0f;
3414 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3415 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3416 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3417 Color_Shirt[3] = 0.0f;
// buffer_z is filled first because every texture sampling call below takes it as input.
3418 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3419 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Optional textures are only sampled when the matching permutation bit is set; their buffers
// are otherwise left uninitialized and must not be read.
3420 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3422 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3423 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3425 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3427 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Path 1: diffuse + specular lighting (needs light vector, eye vector, normal map and gloss map).
3429 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3431 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3432 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3433 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3434 Color_Diffuse[3] = 0.0f;
3435 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3436 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3437 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3438 LightColor[3] = 0.0f;
// Light vector comes from the TEXCOORD1 attribute as a base value plus a per-x slope.
3439 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3440 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3441 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3442 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3443 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3444 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by glosstex[3], which is a raw byte value.
3445 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Eye vector comes from the TEXCOORD2 attribute.
3446 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3447 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// NOTE(review): z is used in this loop but never assigned in a visible line; presumably it is
// loaded from buffer_z[x] at the top of each iteration - confirm against the full file.
3448 for (x = startx;x < endx;x++)
// Diffuse texel as floats, still in byte scale.
3451 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3452 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3453 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3454 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3455 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add tinted pants/shirt layers; index 3 adds nothing because Color_Pants[3] and Color_Shirt[3] are 0.
3457 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3458 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3459 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3460 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3462 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3463 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3464 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3465 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1, with the byte order reversed (byte 2 becomes component 0).
3466 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3467 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3468 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3469 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Evaluate the interpolated light vector at this x, scale by z, then normalize.
3471 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3472 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3473 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3474 DPSOFTRAST_Vector3Normalize(lightnormal);
// Same evaluation for the eye vector.
3476 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3477 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3478 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3479 DPSOFTRAST_Vector3Normalize(eyenormal);
// Normalized sum of the light and eye directions, used as the direction for the specular term.
3481 specularnormal[0] = lightnormal[0] + eyenormal[0];
3482 specularnormal[1] = lightnormal[1] + eyenormal[1];
3483 specularnormal[2] = lightnormal[2] + eyenormal[2];
3484 DPSOFTRAST_Vector3Normalize(specularnormal);
// Both dot products are clamped at 0 so back-facing contributions do not subtract light.
3486 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3487 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is SpecularPower scaled by the gloss alpha byte. NOTE(review): pow(x, 0) is 1 in the
// C library, so a gloss alpha of 0 gives specular == 1 even when the dot product was 0.
3488 specular = pow(specular, SpecularPower * glosstex[3]);
// The two assignment groups below differ only in the added glow term. Results are clamped to
// 255 on the high side only; there is no clamp for negative values before the byte store.
3489 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3491 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3492 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3493 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3494 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3498 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3499 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3500 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3501 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3503 buffer_FragColorbgra8[x*4+0] = d[0];
3504 buffer_FragColorbgra8[x*4+1] = d[1];
3505 buffer_FragColorbgra8[x*4+2] = d[2];
3506 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 2: diffuse lighting only (light vector and normal map, no eye vector or gloss).
// NOTE(review): this path never reads the pants/shirt buffers sampled above, so COLORMAPPING
// has no visible effect here, unlike the SPECULAR path - confirm whether that is intended.
3509 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3511 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3512 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3513 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3514 Color_Diffuse[3] = 0.0f;
3515 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3516 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3517 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3518 LightColor[3] = 0.0f;
3519 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3520 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3521 for (x = startx;x < endx;x++)
3524 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3525 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3526 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3527 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Same normal map decode as in the specular path.
3528 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3529 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3530 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3531 DPSOFTRAST_Vector3Normalize(surfacenormal);
3533 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3534 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3535 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3536 DPSOFTRAST_Vector3Normalize(lightnormal);
3538 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// With and without glow; the texel is scaled by ambient plus the lit diffuse term.
3539 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3541 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3542 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3543 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3544 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3548 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3549 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3550 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3551 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3553 buffer_FragColorbgra8[x*4+0] = d[0];
3554 buffer_FragColorbgra8[x*4+1] = d[1];
3555 buffer_FragColorbgra8[x*4+2] = d[2];
3556 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 3: ambient only - the texel is scaled by Color_Ambient, plus glow when enabled.
// As in path 2, the pants/shirt buffers are not referenced here.
3561 for (x = startx;x < endx;x++)
3564 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3565 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3566 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3567 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3569 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3571 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3572 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3573 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3574 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3578 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3579 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3580 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3581 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3583 buffer_FragColorbgra8[x*4+0] = d[0];
3584 buffer_FragColorbgra8[x*4+1] = d[1];
3585 buffer_FragColorbgra8[x*4+2] = d[2];
3586 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish step.
3589 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source pass.
// Fills post_array4f[POSITION] and screencoord4f via DPSOFTRAST_Array_TransformProject, fills
// TEXCOORD0 (TexMatrix), TEXCOORD3 (positions through ModelToLight) and TEXCOORD4 (plain copy),
// then writes a per-vertex light vector to TEXCOORD1 and eye vector to TEXCOORD2, each
// expressed in the (svector, tvector, normal) basis read from the input arrays.
// NOTE(review): i, position, svector, tvector, normal and EyeVector are used below but their
// declarations are not visible in this listing.
3594 void DPSOFTRAST_VertexShader_LightSource(void)
3597 int numvertices = dpsoftrast.numvertices;
3598 float LightPosition[4];
3599 float LightVector[4];
3600 float LightVectorModelSpace[4];
3601 float EyePosition[4];
3602 float EyeVectorModelSpace[4];
// Only components 0..2 of LightPosition/EyePosition are used in the visible code; [3] is loaded but unused.
3608 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3609 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3610 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3611 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3612 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3613 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3614 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3615 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Whole-array passes; each matrix is addressed as a pointer into uniform4f (4 floats per uniform slot).
3616 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3617 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Output TEXCOORD3 is derived from the input POSITION array, not from input TEXCOORD3 (which is read as the normal below).
3618 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3619 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3620 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and basis vectors: TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal.
3622 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3623 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3624 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3625 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3626 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3627 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3628 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3629 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3630 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3631 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3632 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3633 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, in the same space as the input positions.
3634 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3635 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3636 LightVectorModelSpace[2] = LightPosition[2] - position[2];
// Project it onto svector, tvector and normal (three dot products).
3637 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3638 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3639 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// Stored unnormalized with w = 0; the pixel shader normalizes after interpolation.
3640 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3641 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3642 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3643 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Same construction for the vector from the vertex to the eye.
3644 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3645 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3646 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3647 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3648 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3649 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3650 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3651 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3652 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3653 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Pixel shader for one span lit by a single light source.
// For every x in [span->startx, span->endx) it derives an attenuation from the interpolated
// TEXCOORD3 vector (optionally multiplied by a shadowmap sample), shades the pixel through one
// of three paths (SPECULAR, else DIFFUSE, else ambient only), optionally modulates by a cube
// texture, and writes BGRA8 into buffer_FragColorbgra8, which is finally passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): z, glosstex, eyenormal, diffuse, specular, attenuation and d are used below but
// their declarations (and the braces/else lines) are not visible in this listing.
3657 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, all indexed by x (4 bytes per pixel for the bgra8 ones).
3660 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3661 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668 int x, startx = span->startx, endx = span->endx;
3669 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3670 float CubeVectordata[4];
3671 float CubeVectorslope[4];
3672 float LightVectordata[4];
3673 float LightVectorslope[4];
3674 float EyeVectordata[4];
3675 float EyeVectorslope[4];
3677 float diffusetex[4];
3679 float surfacenormal[4];
3680 float lightnormal[4];
3682 float specularnormal[4];
3685 float SpecularPower;
3686 float CubeVector[4];
// All color uniforms are loaded up front, with components 0 and 2 swapped so that index 0..2
// lines up with the byte order of the *bgra8 buffers. Color_Glow is loaded but not used in any
// visible line of this function.
3689 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3690 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3691 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3692 Color_Glow[3] = 0.0f;
3693 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3694 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3695 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// Loaded from the Alpha uniform, but the output alpha below is diffusetex[3] alone in every path.
3696 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3697 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3698 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3699 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3700 Color_Diffuse[3] = 0.0f;
3701 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3702 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3703 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3704 Color_Specular[3] = 0.0f;
3705 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3706 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3707 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3708 Color_Pants[3] = 0.0f;
3709 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3710 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3711 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3712 Color_Shirt[3] = 0.0f;
3713 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3714 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3715 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3716 LightColor[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by glosstex[3], which is a raw byte value.
3717 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Interpolation setup (base value plus per-x slope) for the light, eye and cube vectors.
3718 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3719 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3720 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3721 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3722 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3726 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3727 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Cube filter lookup uses the TEXCOORD3 attribute; note this call takes no thread argument, unlike the 2D sampler calls.
3729 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3730 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// Path 1: ambient + diffuse + specular.
3731 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3733 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3734 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// NOTE(review): z is used in this loop but never assigned in a visible line; presumably it is
// loaded from buffer_z[x] at the top of each iteration - confirm against the full file.
3735 for (x = startx;x < endx;x++)
// Attenuation is 1 minus the squared length of the interpolated cube vector.
3738 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3739 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3740 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3741 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by each attenuation test is not visible here; presumably
// it skips the pixel, leaving the zero written by the memset above - confirm.
3742 if (attenuation < 0.01f)
3744 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3746 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3747 if (attenuation < 0.01f)
// Diffuse texel as floats, still in byte scale.
3751 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3752 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3753 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3754 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3755 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add tinted pants/shirt layers; index 3 adds nothing because Color_Pants[3] and Color_Shirt[3] are 0.
3757 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3758 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3759 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3760 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3762 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3763 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3764 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3765 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1, with the byte order reversed (byte 2 becomes component 0).
3766 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3767 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3768 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3769 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Evaluate the interpolated light and eye vectors at this x, scale by z, then normalize.
3771 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3772 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3773 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3774 DPSOFTRAST_Vector3Normalize(lightnormal);
3776 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3777 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3778 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3779 DPSOFTRAST_Vector3Normalize(eyenormal);
// Normalized sum of the light and eye directions, used as the direction for the specular term.
3781 specularnormal[0] = lightnormal[0] + eyenormal[0];
3782 specularnormal[1] = lightnormal[1] + eyenormal[1];
3783 specularnormal[2] = lightnormal[2] + eyenormal[2];
3784 DPSOFTRAST_Vector3Normalize(specularnormal);
// Both dot products are clamped at 0.
3786 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3787 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is SpecularPower scaled by the gloss alpha byte. NOTE(review): pow(x, 0) is 1 in the
// C library, so a gloss alpha of 0 gives specular == 1 even when the dot product was 0.
3788 specular = pow(specular, SpecularPower * glosstex[3]);
// Results are clamped to 255 on the high side only. Alpha is the texel alpha, unscaled.
3789 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3791 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3792 attenuation *= (1.0f / 255.0f);
3793 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3794 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3795 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3796 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3800 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3801 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3802 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3803 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3805 buffer_FragColorbgra8[x*4+0] = d[0];
3806 buffer_FragColorbgra8[x*4+1] = d[1];
3807 buffer_FragColorbgra8[x*4+2] = d[2];
3808 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 2: ambient + diffuse (normal map needed, no gloss or eye vector).
3811 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3813 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3814 for (x = startx;x < endx;x++)
// Same attenuation and shadowmap tests as in path 1.
3817 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3818 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3819 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3820 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3821 if (attenuation < 0.01f)
3823 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3825 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3826 if (attenuation < 0.01f)
3830 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3831 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3832 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3833 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3834 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3836 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3837 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3838 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3839 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3841 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3842 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3843 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3844 DPSOFTRAST_Vector3Normalize(surfacenormal);
3846 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3847 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3848 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3849 DPSOFTRAST_Vector3Normalize(lightnormal);
3851 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3852 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3854 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3855 attenuation *= (1.0f / 255.0f);
3856 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3857 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3858 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3859 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3863 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3864 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3865 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3866 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3868 buffer_FragColorbgra8[x*4+0] = d[0];
3869 buffer_FragColorbgra8[x*4+1] = d[1];
3870 buffer_FragColorbgra8[x*4+2] = d[2];
3871 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 3: ambient only - no normal map lookup; the texel is scaled by ambient, light color and attenuation.
3876 for (x = startx;x < endx;x++)
3879 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3880 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3881 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3882 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3883 if (attenuation < 0.01f)
3885 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3887 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3888 if (attenuation < 0.01f)
3892 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3893 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3894 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3895 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3896 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3898 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3899 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3900 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3901 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3903 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3905 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3906 attenuation *= (1.0f / 255.0f);
3907 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3908 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3909 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3910 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3914 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3915 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3916 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3917 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3919 buffer_FragColorbgra8[x*4+0] = d[0];
3920 buffer_FragColorbgra8[x*4+1] = d[1];
3921 buffer_FragColorbgra8[x*4+2] = d[2];
3922 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish step.
3925 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3931 void DPSOFTRAST_VertexShader_Refraction(void)
3933 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3936 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3939 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3940 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3941 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3942 memset(buffer_FragColorbgra8, 0, span->length*4);
3943 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3948 void DPSOFTRAST_VertexShader_Water(void)
3950 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3954 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3957 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3958 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3959 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3960 memset(buffer_FragColorbgra8, 0, span->length*4);
3961 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3966 void DPSOFTRAST_VertexShader_ShowDepth(void)
3968 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3971 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3974 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3975 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3976 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3977 memset(buffer_FragColorbgra8, 0, span->length*4);
3978 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3983 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
3985 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3988 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3991 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3992 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3993 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3994 memset(buffer_FragColorbgra8, 0, span->length*4);
3995 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4000 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4002 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4005 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4008 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4009 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4010 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4011 memset(buffer_FragColorbgra8, 0, span->length*4);
4012 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and span (pixel) entry points plus
// the vertex arrays and texture units that mode consumes.
// NOTE(review): DPSOFTRAST_Draw_ProcessTriangles also reads a member named
// lodarrayindex from this struct; its declaration is not part of this excerpt.
4017 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; invoked by DPSOFTRAST_DrawTriangles once the vertices are loaded
4020 void (*Vertex)(void);
// span stage; invoked by DPSOFTRAST_Draw_ProcessSpans for each span it shades
4021 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices used by this mode; DPSOFTRAST_DrawTriangles walks the
// list and tests each entry against DPSOFTRAST_ARRAY_TOTAL (the table writes
// ~0 after the last real entry)
4022 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by this mode; DPSOFTRAST_Draw_ProcessTriangles
// tests each entry against DPSOFTRAST_MAXTEXTUREUNITS when choosing mip levels
4023 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4025 DPSOFTRAST_ShaderModeInfo;
4027 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4029 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4030 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4031 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4032 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4033 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4034 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4035 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4036 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4037 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4038 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4039 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4040 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4041 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4042 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4043 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4044 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Executes queued state commands from the shared command pool on behalf of
// one rasterizer thread.
//   thread        - thread state handed to each DPSOFTRAST_Interpret_* handler
//   commandoffset - byte offset into dpsoftrast.commandpool.commands to start at
//   endoffset     - byte offset at which interpretation stops
// Returns the offset reached (the updated commandoffset).
4048 int DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int commandoffset, int endoffset)
// keep consuming commands until the requested end offset is reached
4050 while (commandoffset != endoffset)
4052 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4053 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for one opcode: call the matching
// DPSOFTRAST_Interpret_<name> handler, advance commandoffset by the command
// struct's size padded up to a multiple of COMMAND_SIZE (the mask arithmetic
// assumes COMMAND_SIZE is a power of two), and wrap to 0 once the offset
// reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
4055 #define INTERPCOMMAND(name) \
4056 case DPSOFTRAST_OPCODE_##name : \
4057 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4058 commandoffset += sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)); \
4059 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4060 commandoffset = 0; \
4062 INTERPCOMMAND(Viewport)
4063 INTERPCOMMAND(ClearColor)
4064 INTERPCOMMAND(ClearDepth)
4065 INTERPCOMMAND(ColorMask)
4066 INTERPCOMMAND(DepthTest)
4067 INTERPCOMMAND(ScissorTest)
4068 INTERPCOMMAND(Scissor)
4069 INTERPCOMMAND(BlendFunc)
4070 INTERPCOMMAND(BlendSubtract)
4071 INTERPCOMMAND(DepthMask)
4072 INTERPCOMMAND(DepthFunc)
4073 INTERPCOMMAND(DepthRange)
4074 INTERPCOMMAND(PolygonOffset)
4075 INTERPCOMMAND(AlphaTest)
4076 INTERPCOMMAND(AlphaFunc)
4077 INTERPCOMMAND(SetTexture)
4078 INTERPCOMMAND(SetShader)
4079 INTERPCOMMAND(Uniform4f)
4080 INTERPCOMMAND(UniformMatrix4f)
4081 INTERPCOMMAND(Uniform1i)
// Reset (opcode 0) has no handler function and is handled inline
4083 case DPSOFTRAST_OPCODE_Reset:
4088 return commandoffset;
// Shades every span currently queued in thread->spans and empties the queue.
// For each span the thread's render state is first brought up to the command
// offset recorded on the span's triangle, then the span is depth tested (if
// enabled) and handed to the active shader mode's Span function.
//   thread        - rasterizer thread whose span queue is processed
//   commandoffset - command pool offset this thread has interpreted so far
// Returns the command pool offset reached after processing all spans.
4091 int DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread, int commandoffset)
4098 // unsigned int *colorpixel;
4099 unsigned int *depthpixel;
4105 DPSOFTRAST_State_Triangle *triangle;
4106 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the span currently being processed
4107 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4108 for (i = 0; i < thread->numspans; i++)
4110 span = &thread->spans[i];
4111 triangle = &dpsoftrast.trianglepool.triangles[span->triangle];
// apply any state commands that were queued before this triangle
4112 if (commandoffset != triangle->commandoffset)
4114 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4115 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4117 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = per-x step, w[1] = per-y step,
// w[2] = constant term) at the span's first pixel
4119 wslope = triangle->w[0];
4120 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// fixed-point depth step per pixel and depth at the first pixel, the latter
// biased by the two polygon offset terms
4121 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4122 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4123 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// fill pixelmask by comparing the stored depth with the interpolated depth d,
// one loop per depth function
4124 switch(thread->fb_depthfunc)
4127 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4128 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4129 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4130 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4131 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4132 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4133 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4135 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4136 //for (x = 0;x < span->length;x++)
4137 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4138 // if there is no color buffer, skip pixel shader
// narrow [startx, endx) from both ends past pixels whose mask entry is clear
4140 endx = span->length;
4141 while (startx < endx && !pixelmask[startx])
4143 while (endx > startx && !pixelmask[endx-1])
4146 continue; // no pixels to fill
4147 span->pixelmask = pixelmask;
4148 span->startx = startx;
4150 // run pixel shader if appropriate
4151 // do this before running depthmask code, to allow the pixelshader
4152 // to clear pixelmask values for alpha testing
4153 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4154 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes enabled: revisit the surviving range with the interpolated depth
4155 if (thread->depthmask)
4156 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4162 // no depth testing means we're just dealing with color...
4163 // if there is no color buffer, skip pixel shader
4164 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes
4166 memset(pixelmask, 1, span->length);
4167 span->pixelmask = pixelmask;
4169 span->endx = span->length;
4170 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the queue has been consumed
4174 thread->numspans = 0;
4175 return commandoffset;
// Scan-converts pending triangles from the shared triangle pool into spans
// for one rasterizer thread and shades them.
//   thread       - rasterizer thread; its commandoffset / triangleoffset mark
//                  where it left off and are updated on return
//   freetriangle - triangle pool index at which to stop
4178 void DPSOFTRAST_Draw_GenerateSpans(DPSOFTRAST_State_Thread *thread, int freetriangle)
// rows [miny, maxy) form this thread's share of the framebuffer: the height
// is divided evenly between dpsoftrast.numthreads threads by thread index
4181 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4182 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4183 int commandoffset = thread->commandoffset;
4184 int triangleoffset = thread->triangleoffset;
4185 DPSOFTRAST_State_Triangle *triangle = NULL;
4192 while (triangleoffset != freetriangle)
4194 triangle = &dpsoftrast.trianglepool.triangles[triangleoffset];
// the triangle pool is indexed as a ring
4195 if (++triangleoffset >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL)
4197 starty = triangle->starty + 1;
4198 endy = triangle->endy;
// test whether the triangle's row range misses this thread's rows entirely
4199 if (starty >= maxy || endy <= miny)
4201 numpoints = triangle->numpoints;
4202 coords[0] = _mm_load_ps(triangle->coords[0]);
4203 coords[1] = _mm_load_ps(triangle->coords[1]);
4204 coords[2] = _mm_load_ps(triangle->coords[2]);
4205 coords[3] = _mm_load_ps(triangle->coords[3]);
4206 ycoords = _mm_load_si128((const __m128i *)triangle->ycoords);
4211 for (y = starty; y < endy;)
4213 __m128 xcoords, xslope;
// yccmask carries four bits per point; a point's bits are set when y is
// greater than that point's integer y coordinate
4214 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), ycoords);
4215 int yccmask = _mm_movemask_epi8(ycc);
// each edge is a (p, n) pair of point indices into coords[]
4216 int edge0p, edge0n, edge1p, edge1n;
// edge selection for polygons using all four points
4223 case 0xFFFF: /*0000*/ y = endy; continue;
4224 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4225 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4226 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4227 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4228 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4229 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4230 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4231 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4232 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4233 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4234 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4235 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4236 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4237 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4238 case 0x0000: /*1111*/ y++; continue;
// edge selection for plain triangles (three points)
4246 case 0xFFFF: /*000*/ y = endy; continue;
4247 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4248 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4249 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4250 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4251 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4252 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4253 case 0x0000: /*111*/ y++; continue;
// NOTE(review): nexty looks like the last row for which the edge pair chosen
// above stays valid (smallest remaining point y) - confirm; it is capped at
// endy-1 either way
4256 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), ycoords);
4257 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4258 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4259 nexty = _mm_extract_epi16(ycc, 0);
4260 if(nexty >= endy) nexty = endy-1;
// compares the larger x of edge 0's endpoints with the smaller x of edge 1's
4261 if (_mm_ucomigt_ss(_mm_max_ss(coords[edge0n], coords[edge0p]), _mm_min_ss(coords[edge1n], coords[edge1p])))
// per-row x step of both edges (endpoint delta divided by its y component),
// then the x of both edges at row y, plus 0.5
4270 xslope = _mm_sub_ps(_mm_movelh_ps(coords[edge0n], coords[edge1n]), _mm_movelh_ps(coords[edge0p], coords[edge1p]));
4271 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4272 xcoords = _mm_add_ps(_mm_movelh_ps(coords[edge0p], coords[edge1p]),
4273 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(coords[edge0p], coords[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4274 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// walk the rows up to nexty, stepping both edge x values each row
4275 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4277 int startx, endx, offset;
// edge 0 gives the start x, edge 1 the end x; clamp to [0, fb_width]
4278 startx = _mm_cvtss_si32(xcoords);
4279 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4280 if (startx < 0) startx = 0;
4281 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4282 if (startx >= endx) continue;
// emit the row as one or more spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4283 for (offset = startx; offset < endx;)
4285 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4286 span->triangle = (int)(triangle - dpsoftrast.trianglepool.triangles);
4289 span->length = endx - offset;
4290 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4291 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4292 offset += span->length;
// span queue full: shade what has been collected so far
4293 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4294 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
// shade whatever is still queued
4300 if (thread->numspans > 0)
4301 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
// catch up on commands up to the last triangle visited, even if it produced no spans
4302 if (commandoffset != triangle->commandoffset)
4304 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4305 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// record progress for the next call
4310 thread->commandoffset = commandoffset;
4311 thread->triangleoffset = triangleoffset;
// Makes every rasterizer thread finish all triangles queued so far, then
// resets the triangle and command pool usage counters.
// NOTE(review): the SDL wait path and the direct DPSOFTRAST_Draw_GenerateSpans
// call look like alternative builds (threaded / non-threaded) selected by
// preprocessor lines that are not part of this excerpt - confirm.
4315 void DPSOFTRAST_Draw_FlushThreads(void)
4317 DPSOFTRAST_State_Thread *thread;
// publish any triangles that have been queued but not yet released for drawing
4319 if(dpsoftrast.drawtriangle != dpsoftrast.trianglepool.freetriangle)
4322 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4325 SDL_LockMutex(dpsoftrast.trianglemutex);
4327 for (i = 0; i < dpsoftrast.numthreads; i++)
4329 thread = &dpsoftrast.threads[i];
// until this thread has caught up: flag that we are waiting on it, wake the
// workers, and sleep on the thread's own condition variable
4331 while (thread->triangleoffset != dpsoftrast.drawtriangle)
4333 thread->waiting = true;
4334 SDL_CondBroadcast(dpsoftrast.trianglecond);
4335 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
4336 thread->waiting = false;
// run the outstanding triangles for this thread state directly on the caller
4339 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4340 DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4344 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// everything queued has been consumed
4346 dpsoftrast.trianglepool.usedtriangles = 0;
4347 dpsoftrast.commandpool.usedcommands = 0;
// Worker thread entry point.
//   data - the DPSOFTRAST_State_Thread this worker services
// Loops for as long as thread->index is non-negative: rasterizes newly
// published triangles, and otherwise sleeps on dpsoftrast.trianglecond.
4351 static int DPSOFTRAST_Draw_Thread(void *data)
4353 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4354 while(thread->index >= 0)
// triangles published that this thread has not processed yet
4356 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4358 DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
// re-check under the mutex before sleeping; if more work was published in the
// meantime the mutex is released again without waiting
4362 SDL_LockMutex(dpsoftrast.trianglemutex);
4363 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4365 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// wake a flusher that is blocked waiting for this thread, then sleep until
// trianglecond is signalled
4368 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4369 SDL_CondWait(dpsoftrast.trianglecond, dpsoftrast.trianglemutex);
4370 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// Sets up the triangles of one indexed draw call: clips each against the
// near plane, culls it, computes its interpolation planes and mip levels,
// and appends it to the shared triangle pool for the rasterizer threads.
//   firstvertex  - subtracted from every element index to address the
//                  transformed vertex arrays
//   numtriangles - number of triangles (three element indices each)
//   element3i    - element indices as int
//   element3s    - element indices as unsigned short
//   arraymask    - per-array flags (its use is not visible in this excerpt)
//   numarrays    - number of vertex arrays to build interpolation planes for
4377 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask, int numarrays)
4380 int cullface = dpsoftrast.cullface;
4381 int width = dpsoftrast.fb_width;
4382 int height = dpsoftrast.fb_height;
// largest valid pixel coordinate, (width-1, height-1) repeated across the register
4383 __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
4384 DPSOFTRAST_State_Triangle *triangle;
4396 __m128 triangleedge1, triangleedge2, trianglenormal;
4399 DPSOFTRAST_Texture *texture;
4400 screen[3] = _mm_setzero_ps();
4401 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4402 for (i = 0;i < numtriangles;i++)
4404 // generate the 3 edges of this triangle
4405 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three element indices from whichever index array applies
4408 e[0] = element3i[i*3+0] - firstvertex;
4409 e[1] = element3i[i*3+1] - firstvertex;
4410 e[2] = element3i[i*3+2] - firstvertex;
4414 e[0] = element3s[i*3+0] - firstvertex;
4415 e[1] = element3s[i*3+1] - firstvertex;
4416 e[2] = element3s[i*3+2] - firstvertex;
// SKIPBACKFACE: builds the two screen-space edges that meet at screen[1] and
// the scalar component of their cross product, then tests its sign against
// zero (both a less-than and a greater-than test appear, presumably chosen
// by cullface - confirm)
4425 #define SKIPBACKFACE \
4426 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4427 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4428 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4429 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4430 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4434 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4438 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4442 //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
4443 // _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
4444 //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
4445 //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
4446 //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
4448 // macros for clipping vertices
// CLIPPEDVERTEXLERP(k,p1,p2): screen[k] = projection of the point where edge
// p1->p2 crosses the near plane; the interpolation fraction is kept in
// clipfrac[p1] for reuse by GENATTRIBLERP
4450 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4451 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4453 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
4454 screen[k] = DPSOFTRAST_Draw_ProjectVertex(_mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1]))); \
4456 #define CLIPPEDVERTEXCOPY(k,p1) \
4457 screen[k] = _mm_load_ps(&dpsoftrast.screencoord4f[e[p1]*4]);
// GENATTRIBCOPY / GENATTRIBLERP: the same copy / interpolate choices as the
// vertex macros above, applied to array j of the post-transform arrays
4459 #define GENATTRIBCOPY(j, attrib, p1) \
4460 attrib = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]);
// GENATTRIBS (defined after GENATTRIBLERP): produces the three attribute
// values for array j, with one case per clip configuration
4461 #define GENATTRIBLERP(j, attrib, p1, p2) \
4463 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p2]*4]); \
4464 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4466 #define GENATTRIBS(j, attrib0, attrib1, attrib2) \
4470 case 0: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4471 case 1: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4472 case 2: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4473 case 3: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4474 case 4: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4475 case 5: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4476 case 6: GENATTRIBLERP(j, attrib0, 1, 2); GENATTRIBCOPY(j, attrib1, 2); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4479 // calculate distance from nearplane
// clipdist is z + w of the transformed position; negative means behind the plane
4480 clipdist[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
4481 clipdist[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
4482 clipdist[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
// one branch per combination of vertices in front of the near plane; each
// fills screen[0..2] (and screen[3] when clipping yields a quad)
4483 if (clipdist[0] >= 0.0f)
4485 if (clipdist[1] >= 0.0f)
4487 if (clipdist[2] >= 0.0f)
4489 // triangle is entirely in front of nearplane
4490 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
// only vertex 2 is behind the plane
4497 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4505 if (clipdist[2] >= 0.0f)
// only vertex 1 is behind the plane
4507 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
// vertices 1 and 2 are behind the plane
4514 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4521 else if (clipdist[1] >= 0.0f)
4523 if (clipdist[2] >= 0.0f)
// only vertex 0 is behind the plane
4525 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
// vertices 0 and 2 are behind the plane
4532 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4538 else if (clipdist[2] >= 0.0f)
// vertices 0 and 1 are behind the plane
4540 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4545 else continue; // triangle is entirely behind nearplane
4548 // calculate integer y coords for triangle points
// pack the x,y of all points as 16-bit integers, reduce to a min/max bounding
// box and clamp the box to the framebuffer
4549 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4550 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4551 screenmin = _mm_min_epi16(screeni, screenir),
4552 screenmax = _mm_max_epi16(screeni, screenir);
4553 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4554 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4555 screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
4556 screenmax = _mm_min_epi16(screenmax, fbmax);
4557 // skip offscreen triangles
4558 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range of the clamped box; screeny keeps each point's y as a 32-bit integer
4560 starty = _mm_extract_epi16(screenmin, 1);
4561 endy = _mm_extract_epi16(screenmax, 1)+1;
4562 screeny = _mm_srai_epi32(screeni, 16);
// triangle pool (almost) full: make room before taking the next slot
4565 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
4567 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
4569 DPSOFTRAST_Draw_FlushThreads();
// fill in the next free pool entry; commandoffset ties the triangle to the
// render state commands queued before it
4572 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
4573 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
4574 triangle->starty = starty;
4575 triangle->endy = endy;
4576 triangle->numpoints = numpoints;
4577 _mm_store_ps(triangle->coords[0], screen[0]);
4578 _mm_store_ps(triangle->coords[1], screen[1]);
4579 _mm_store_ps(triangle->coords[2], screen[2]);
4580 _mm_store_ps(triangle->coords[3], numpoints > 3 ? screen[3] : screen[2]);
4581 _mm_store_si128((__m128i *)triangle->ycoords, screeny);
4583 // calculate attribute plans for triangle data...
4584 // okay, this triangle is going to produce spans, we'd better project
4585 // the interpolants now (this is what gives perspective texturing),
4586 // this consists of simply multiplying all arrays by the W coord
4587 // (which is basically 1/Z), which will be undone per-pixel
4588 // (multiplying by Z again) to get the perspective-correct array
4591 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
// edge components divided by the scalar normal computed in SKIPBACKFACE give
// the factors that turn per-edge deltas into per-pixel x / y slopes
4592 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4593 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4594 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4595 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4596 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of the three points, broadcast to all lanes
4597 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4598 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4599 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane for w itself, stored as triangle->w[0] = x slope, [1] = y slope,
// [2] = value at the screen origin
4600 attribedge1 = _mm_sub_ss(w0, w1);
4601 attribedge2 = _mm_sub_ss(w2, w1);
4602 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4603 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4604 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4605 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4606 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4607 _mm_store_ss(&triangle->w[0], attribxslope);
4608 _mm_store_ss(&triangle->w[1], attribyslope);
4609 _mm_store_ss(&triangle->w[2], attriborigin);
// the same plane setup for each vertex array, with every attribute first
// multiplied by its point's w
4610 for (j = 0;j < numarrays;j++)
4614 __m128 attrib0, attrib1, attrib2;
4615 GENATTRIBS(j, attrib0, attrib1, attrib2);
4616 attriborigin = _mm_mul_ps(attrib1, w1);
4617 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4618 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4619 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4620 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4621 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4622 _mm_stream_ps(triangle->attribs[j][0], attribxslope);
4623 _mm_stream_ps(triangle->attribs[j][1], attribyslope);
4624 _mm_stream_ps(triangle->attribs[j][2], attriborigin);
4629 // adjust texture LOD by texture density, in the simplest way possible...
4631 __m128 mipedgescale, mipedgetc, mipdensity, attrib0, attrib1, attrib2;
4632 memset(triangle->mip, 0, sizeof(triangle->mip));
// approximate reciprocal screen-space length of the two edges
4633 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4634 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4635 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
// change of the shader mode's LOD array along both edges, per unit of screen distance
4636 k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
4637 GENATTRIBS(k, attrib0, attrib1, attrib2);
4638 mipedgetc = _mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1));
4639 mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
// pick a mip level for each texture unit listed for the current shader mode
4640 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4642 int texunit = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].texunits[j];
4643 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4645 texture = dpsoftrast.texbound[texunit];
// only textures whose filter ranks above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a mip level
4646 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two ints read from texture->mipmap[0][2..3] (presumably the base
// level's width and height - confirm), square, sum per edge, and keep the
// smaller of the two edge densities
4648 mipdensity = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4649 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4650 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4651 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4652 // this will be multiplied in the texturing routine by the texture resolution
4653 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, capped at the last mip level
4656 y = (int)(log((float)y)*0.5f/M_LN2);
4657 if (y > texture->mipmaps - 1)
4658 y = texture->mipmaps - 1;
4659 triangle->mip[texunit] = y;
// advance the ring index of the next free pool slot
4665 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
4666 dpsoftrast.trianglepool.usedtriangles++;
// enough triangles queued: publish them to the rasterizer threads now
4669 if (numqueued >= DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES)
4672 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4675 SDL_LockMutex(dpsoftrast.trianglemutex);
4676 SDL_CondBroadcast(dpsoftrast.trianglecond);
4677 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4679 DPSOFTRAST_Draw_FlushThreads();
// publish whatever is still queued
4687 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4690 SDL_LockMutex(dpsoftrast.trianglemutex);
4691 SDL_CondBroadcast(dpsoftrast.trianglecond);
4692 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4694 DPSOFTRAST_Draw_FlushThreads();
// Draws an indexed triangle list with the current shader mode.
//   firstvertex  - first vertex to load from the bound vertex pointers
//   numvertices  - number of vertices to load
//   numtriangles - number of triangles to draw
//   element3i    - element indices as int
//   element3s    - element indices as unsigned short
4700 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
// highest array index in use, and one in-use flag per array; the position
// array is always used
4703 int lastarray = DPSOFTRAST_ARRAY_POSITION;
4704 unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4705 memset(arraymask, false, sizeof(arraymask));
4706 arraymask[DPSOFTRAST_ARRAY_POSITION] = true;
// walk the array list of the current shader mode
4707 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4709 int arrayindex = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// out-of-range entries (the table uses ~0) are not real arrays
4710 if (arrayindex >= DPSOFTRAST_ARRAY_TOTAL)
4714 case DPSOFTRAST_ARRAY_POSITION:
4715 case DPSOFTRAST_ARRAY_COLOR:
// texcoord arrays are additionally checked for a bound pointer
4718 if (dpsoftrast.pointer_texcoordf[arrayindex-DPSOFTRAST_ARRAY_TEXCOORD0] == NULL)
4722 arraymask[arrayindex] = true;
4723 if (arrayindex > lastarray)
4724 lastarray = arrayindex;
// load the vertices, run the vertex stage, then set up and queue the triangles
4726 DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, arraymask[DPSOFTRAST_ARRAY_COLOR]);
4727 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4728 // DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.screencoord4f, dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
4729 DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask, lastarray+1);
// Calls DPSOFTRAST_Draw_SyncCommands() and then DPSOFTRAST_Draw_FlushThreads().
// NOTE(review): presumably this returns only once all queued work has been
// consumed by the draw threads - confirm against those two helpers.
4732 void DPSOFTRAST_Flush(void)
4734 DPSOFTRAST_Draw_SyncCommands();
4735 DPSOFTRAST_Draw_FlushThreads();
4738 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4748 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4749 dpsoftrast.bigendian = u.b[3];
4750 dpsoftrast.fb_width = width;
4751 dpsoftrast.fb_height = height;
4752 dpsoftrast.fb_depthpixels = depthpixels;
4753 dpsoftrast.fb_colorpixels[0] = colorpixels;
4754 dpsoftrast.fb_colorpixels[1] = NULL;
4755 dpsoftrast.fb_colorpixels[1] = NULL;
4756 dpsoftrast.fb_colorpixels[1] = NULL;
4757 dpsoftrast.texture_firstfree = 1;
4758 dpsoftrast.texture_end = 1;
4759 dpsoftrast.texture_max = 0;
4760 dpsoftrast.viewport[0] = 0;
4761 dpsoftrast.viewport[1] = 0;
4762 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4763 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4764 dpsoftrast.color[0] = 1;
4765 dpsoftrast.color[1] = 1;
4766 dpsoftrast.color[2] = 1;
4767 dpsoftrast.color[3] = 1;
4768 dpsoftrast.cullface = GL_BACK;
4770 dpsoftrast.numthreads = bound(1, numthreads, 64);
4771 dpsoftrast.trianglemutex = SDL_CreateMutex();
4772 dpsoftrast.trianglecond = SDL_CreateCond();
4774 dpsoftrast.numthreads = 1;
4776 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4777 for (i = 0; i < dpsoftrast.numthreads; i++)
4779 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4781 thread->colormask[1] = 1;
4782 thread->colormask[2] = 1;
4783 thread->colormask[3] = 1;
4784 thread->blendfunc[0] = GL_ONE;
4785 thread->blendfunc[1] = GL_ZERO;
4786 thread->depthmask = true;
4787 thread->depthtest = true;
4788 thread->depthfunc = GL_LEQUAL;
4789 thread->scissortest = false;
4790 thread->alphatest = false;
4791 thread->alphafunc = GL_GREATER;
4792 thread->alphavalue = 0.5f;
4793 thread->scissor[0] = 0;
4794 thread->scissor[1] = 0;
4795 thread->scissor[2] = dpsoftrast.fb_width;
4796 thread->scissor[3] = dpsoftrast.fb_height;
4797 thread->depthrange[0] = 0;
4798 thread->depthrange[1] = 1;
4799 thread->polygonoffset[0] = 0;
4800 thread->polygonoffset[1] = 0;
4802 thread->numspans = 0;
4803 thread->triangleoffset = 0;
4804 thread->commandoffset = 0;
4805 thread->waiting = false;
4807 thread->waitcond = SDL_CreateCond();
4810 thread->validate = -1;
4811 DPSOFTRAST_Validate(thread, -1);
4813 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4818 void DPSOFTRAST_Shutdown(void)
4822 if(dpsoftrast.numthreads > 0)
4824 DPSOFTRAST_State_Thread *thread;
4825 SDL_LockMutex(dpsoftrast.trianglemutex);
4826 for (i = 0; i < dpsoftrast.numthreads; i++)
4828 thread = &dpsoftrast.threads[i];
4831 SDL_CondBroadcast(dpsoftrast.trianglecond);
4832 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4833 for (i = 0; i < dpsoftrast.numthreads; i++)
4835 thread = &dpsoftrast.threads[i];
4836 SDL_WaitThread(thread->thread, NULL);
4837 SDL_DestroyCond(thread->waitcond);
4839 SDL_DestroyMutex(dpsoftrast.trianglemutex);
4840 SDL_DestroyCond(dpsoftrast.trianglecond);
4843 for (i = 0;i < dpsoftrast.texture_end;i++)
4844 if (dpsoftrast.texture[i].bytes)
4845 MM_FREE(dpsoftrast.texture[i].bytes);
4846 if (dpsoftrast.texture)
4847 free(dpsoftrast.texture);
4848 if (dpsoftrast.threads)
4849 MM_FREE(dpsoftrast.threads);
4850 memset(&dpsoftrast, 0, sizeof(dpsoftrast));