3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
13 typedef qboolean bool;
17 #define ATOMIC_SIZE 32
21 #define ALIGN(var) var __attribute__((__aligned__(16)))
22 #define ATOMIC(var) var __attribute__((__aligned__(32)))
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
26 #define ATOMIC_COUNTER volatile int
27 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
31 #elif defined(_MSC_VER)
32 #define ALIGN(var) __declspec(align(16)) var
33 #define ATOMIC(var) __declspec(align(32)) var
35 #define MEMORY_BARRIER (_mm_sfence())
37 #define ATOMIC_COUNTER volatile LONG
38 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
40 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
49 #define ALIGN(var) var
50 #define ATOMIC(var) var
55 #include <SDL_thread.h>
57 #define MEMORY_BARRIER ((void)0)
58 #define ATOMIC_COUNTER int
59 #define ATOMIC_INCREMENT(counter) (++(counter))
60 #define ATOMIC_DECREMENT(counter) (--(counter))
61 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62 typedef void SDL_Thread;
63 typedef void SDL_cond;
64 typedef void SDL_mutex;
68 #include <emmintrin.h>
70 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
72 static void *MM_CALLOC(size_t nmemb, size_t size)
74 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
75 if(ptr != NULL) memset(ptr, 0, nmemb*size);
79 #define MM_FREE _mm_free
81 #define MM_MALLOC(size) malloc(size)
82 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets.  DPSOFTRAST_ARRAY_TOTAL is the last enumerator and
// is used as the dimension of per-attribute arrays.
86 typedef enum DPSOFTRAST_ARRAY_e
88 DPSOFTRAST_ARRAY_POSITION,
89 DPSOFTRAST_ARRAY_COLOR,
90 DPSOFTRAST_ARRAY_TEXCOORD0,
91 DPSOFTRAST_ARRAY_TEXCOORD1,
92 DPSOFTRAST_ARRAY_TEXCOORD2,
93 DPSOFTRAST_ARRAY_TEXCOORD3,
94 DPSOFTRAST_ARRAY_TEXCOORD4,
95 DPSOFTRAST_ARRAY_TEXCOORD5,
96 DPSOFTRAST_ARRAY_TEXCOORD6,
97 DPSOFTRAST_ARRAY_TEXCOORD7,
98 DPSOFTRAST_ARRAY_TOTAL
// One entry of the dpsoftrast.texture array.
102 typedef struct DPSOFTRAST_Texture_s
// filter mode, stored by DPSOFTRAST_Texture_Filter
109 DPSOFTRAST_TEXTURE_FILTER filter;
112 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL pointer marks the slot as unused
113 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
114 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are sized and aligned in units of ALIGN_SIZE.
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every queued command: an opcode and the size in
// bytes that the command occupies in the command pool.
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
123 unsigned char opcode;
124 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand where the pool wraps around.
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and
// the aligned struct type DPSOFTRAST_Command_<name>, which starts with the
// same opcode/commandsize header.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
134 unsigned char opcode; \
135 unsigned short commandsize; \
137 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer, and a per-command size limit.
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer that queued commands are allocated from (see DPSOFTRAST_AllocateCommand).
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
146 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
148 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the DPSOFTRAST_CALCATTRIB macros.
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
152 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array, three float[4] rows: [0] is multiplied by span x,
// [1] is multiplied by span y, [2] is the constant term
154 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
156 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span: loads attribs[arrayindex][0] into slope and
// sets data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// The rows are read with _mm_load_ps.
158 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
159 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
160 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
161 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
162 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB for float[4] outputs:
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// Every macro parameter is parenthesized so that pointer and array
// expressions (for example "spans + i") can be passed as arguments.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle.
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
179 int triangle; // triangle this span was generated by
180 int x; // framebuffer x coord
181 int y; // framebuffer y coord
182 int length; // pixel count
183 int startx; // usable range (according to pixelmask)
184 int endx; // usable range (according to pixelmask)
185 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 DPSOFTRAST_State_Span);
189 #define DPSOFTRAST_DRAW_MAXSPANS 1024
190 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bit flags kept in a thread's validate field.  Each bit marks one group of
// derived state as stale; DPSOFTRAST_Validate recomputes the group and clears the bit.
192 #define DPSOFTRAST_VALIDATE_FB 1
193 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
194 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
195 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the blendfunc
// factor pair and the blendsubtract flag.  DPSOFTRAST_BLENDMODE_TOTAL is the
// last enumerator.
197 typedef enum DPSOFTRAST_BLENDMODE_e
199 DPSOFTRAST_BLENDMODE_OPAQUE,
200 DPSOFTRAST_BLENDMODE_ALPHA,
201 DPSOFTRAST_BLENDMODE_ADDALPHA,
202 DPSOFTRAST_BLENDMODE_ADD,
203 DPSOFTRAST_BLENDMODE_INVMOD,
204 DPSOFTRAST_BLENDMODE_MUL,
205 DPSOFTRAST_BLENDMODE_MUL2,
206 DPSOFTRAST_BLENDMODE_SUBALPHA,
207 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
208 DPSOFTRAST_BLENDMODE_TOTAL
210 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state.  The DPSOFTRAST_Interpret_* handlers write the
// render state fields; the DPSOFTRAST_Recalc* functions fill the derived fb_* values.
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
231 float polygonoffset[2];
234 int shader_permutation;
236 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
238 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
241 // DPSOFTRAST_VALIDATE_ flags
244 // derived values (DPSOFTRAST_VALIDATE_FB)
246 int fb_clearscissor[4];
247 ALIGN(float fb_viewportcenter[4]);
248 ALIGN(float fb_viewportscale[4]);
250 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// presumably this thread's read position in the command pool — TODO confirm
256 ATOMIC(volatile int commandoffset);
// flags used together with drawmutex and the condition variables in DPSOFTRAST_Draw_FreeCommandPool
258 volatile bool waiting;
259 volatile bool starving;
262 SDL_mutex *drawmutex;
266 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
267 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
269 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is the dpsoftrast variable
// declared right after the struct.
271 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets, stored by DPSOFTRAST_SetRenderTargets
275 unsigned int *fb_depthpixels;
276 unsigned int *fb_colorpixels[4];
279 ALIGN(float fb_viewportcenter[4]);
280 ALIGN(float fb_viewportscale[4]);
283 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
284 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
286 const float *pointer_vertex3f;
287 const float *pointer_color4f;
288 const unsigned char *pointer_color4ub;
289 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
292 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
294 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
298 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
299 float *screencoord4f;
305 int shader_permutation;
// texture table: the first slot index to try when creating a texture, and the array itself
309 int texture_firstfree;
310 DPSOFTRAST_Texture *texture;
// message describing the most recent rejected request
315 const char *errorstring;
318 DPSOFTRAST_State_Thread *threads;
// command pool offset published by DPSOFTRAST_Draw_SyncCommands
320 ATOMIC(volatile int drawcommand);
322 DPSOFTRAST_State_Command_Pool commandpool;
326 DPSOFTRAST_State dpsoftrast;
// Scale applied when a float depth is converted to its integer representation.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels into one 32-bit pixel: b in bits 0-7, g in bits
// 8-15, r in bits 16-23, a in bits 24-31.  Each channel is scaled by 255 and
// rounded by adding 0.5.  Arguments are parenthesized so that expressions
// may be passed, and the shifts are done on unsigned values so that a full
// alpha channel does not shift a signed int into its sign bit.  The result
// is converted back to int, as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth value DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
337 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
338 fb_viewportcenter[3] = 0.5f;
339 fb_viewportcenter[0] = 0.0f;
340 fb_viewportscale[1] = 0.5f * viewport[2];
341 fb_viewportscale[2] = -0.5f * viewport[3];
342 fb_viewportscale[3] = 0.5f;
343 fb_viewportscale[0] = 1.0f;
// Recompute a thread's derived framebuffer state: the clear/scissor rectangle
// fb_clearscissor (x, y, width, height) and the viewport center/scale.
346 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
349 // and viewport projection values
// start from the scissor rectangle; its Y range is mirrored against fb_height
352 x1 = thread->scissor[0];
353 x2 = thread->scissor[0] + thread->scissor[2];
354 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
355 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle is the whole framebuffer
356 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
358 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// store as origin plus extent
361 thread->fb_clearscissor[0] = x1;
362 thread->fb_clearscissor[1] = y1;
363 thread->fb_clearscissor[2] = x2 - x1;
364 thread->fb_clearscissor[3] = y2 - y1;
366 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
369 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Translate the thread's blend factor pair (blendfunc[0] = source factor,
// blendfunc[1] = destination factor) and its blendsubtract flag into one of
// the DPSOFTRAST_BLENDMODE_ values, stored in thread->fb_blendmode.
// Factor pairs without a table entry select DPSOFTRAST_BLENDMODE_OPAQUE.
374 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 if (thread->blendsubtract)
// both factors are folded into one switch key: source in the high 16 bits
378 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380 #define BLENDFUNC(sfactor, dfactor, blendmode) \
381 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
382 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
383 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// presumably the branch taken when blendsubtract is off — TODO confirm
388 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
391 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
393 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
394 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
395 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
396 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
397 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
398 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
// NOTE(review): this entry selects the SUBALPHA mode in the switch that is
// not guarded by blendsubtract, using the same factor pair as the entry in
// the first switch — confirm that this is intended
399 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
400 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Call DPSOFTRAST_Validate only when at least one of the requested
// DPSOFTRAST_VALIDATE_ bits in f is still set in the thread's validate field.
// The expression always evaluates to 0.  Both parameters are parenthesized so
// that pointer expressions can be passed as thread.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recompute the derived state groups that are both requested in mask and
// still flagged in thread->validate.  Each handled flag is cleared before the
// matching DPSOFTRAST_Recalc* function runs.
407 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually stale
409 mask &= thread->validate;
412 if (mask & DPSOFTRAST_VALIDATE_FB)
414 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
415 DPSOFTRAST_RecalcFB(thread);
417 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
420 DPSOFTRAST_RecalcDepthFunc(thread);
422 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
425 DPSOFTRAST_RecalcBlendFunc(thread);
429 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
432 return &dpsoftrast.texture[index];
// Enlarge the dpsoftrast.texture array and rebase every texbound pointer
// (global and per thread) so that it refers to the same entry in the
// reallocated array.
436 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address; it is needed to turn bound pointers into indices
438 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
439 DPSOFTRAST_State_Thread *thread;
443 // expand texture array as needed
444 if (dpsoftrast.texture_max < 1024)
445 dpsoftrast.texture_max = 1024;
447 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the array
// without a NULL check; on failure the old array is lost and the rebasing
// below computes pointers from NULL.  The entries added by the growth are
// not zeroed here.
448 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
449 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
450 if(dpsoftrast.texbound[i])
451 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
452 for (j = 0; j < dpsoftrast.numthreads; j++)
454 thread = &dpsoftrast.threads[j];
455 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
456 if(thread->texbound[i])
457 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture of the given dimensions.
// flags carries the DPSOFTRAST_TEXTURE_FLAG_ bits and the texture format
// (extracted with DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK).
// Invalid requests store a message in dpsoftrast.errorstring.  On success a
// slot of the texture array is initialized, the mip level table is filled
// and zeroed pixel storage is allocated.
// Presumably returns the texture index, with 0 meaning failure — TODO confirm.
461 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
470 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
471 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
472 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed before the per-dimension size limit is
// checked, so large arguments can overflow int here
473 if (width*height*depth < 1)
475 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
478 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
485 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
486 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
487 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
// depth textures carry additional restrictions
489 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
490 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
497 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
500 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
507 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
512 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above
517 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
522 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is zero only when x has at most one bit set
527 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
532 // find first empty slot in texture array
533 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
534 if (!dpsoftrast.texture[texnum].bytes)
536 dpsoftrast.texture_firstfree = texnum + 1;
// make room in the array and extend the used range to cover the new slot
537 if (dpsoftrast.texture_max <= texnum)
538 DPSOFTRAST_Texture_Grow();
539 if (dpsoftrast.texture_end <= texnum)
540 dpsoftrast.texture_end = texnum + 1;
541 texture = &dpsoftrast.texture[texnum];
542 memset(texture, 0, sizeof(*texture));
543 texture->flags = flags;
544 texture->width = width;
545 texture->height = height;
546 texture->depth = depth;
547 texture->sides = sides;
// byte size of one mip level: 4 bytes per pixel, all sides included
559 s = w * h * d * sides * 4;
// record the level: byte offset, byte size, width, height, depth
560 texture->mipmap[mipmaps][0] = size;
561 texture->mipmap[mipmaps][1] = s;
562 texture->mipmap[mipmaps][2] = w;
563 texture->mipmap[mipmaps][3] = h;
564 texture->mipmap[mipmaps][4] = d;
567 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
573 texture->mipmaps = mipmaps;
574 texture->size = size;
576 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible here; a slot
// whose bytes pointer is NULL is treated as unused by the lookup code
577 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release the texture with the given index (no-op when the index does not
// name a live texture): frees the pixel storage, clears the slot and updates
// the free/used bookkeeping of the texture array.
581 void DPSOFTRAST_Texture_Free(int index)
583 DPSOFTRAST_Texture *texture;
584 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
588 MM_FREE(texture->bytes);
589 texture->bytes = NULL;
590 memset(texture, 0, sizeof(*texture));
591 // adjust the free range and used range
592 if (dpsoftrast.texture_firstfree > index)
593 dpsoftrast.texture_firstfree = index;
// shrink the used range past trailing empty slots
594 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
595 dpsoftrast.texture_end--;
// Regenerate mip levels 1..mipmaps-1 of a texture.  Each level is produced
// from the level before it by averaging neighbouring pixels (4 bytes per
// pixel, each byte averaged independently with rounding).
// Does nothing for an invalid index.
597 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 int i, x, y, z, w, layer0, layer1, row0, row1;
600 unsigned char *o, *i0, *i1, *i2, *i3;
601 DPSOFTRAST_Texture *texture;
602 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
603 if (texture->mipmaps <= 1)
605 for (i = 1;i < texture->mipmaps;i++)
607 for (z = 0;z < texture->mipmap[i][4];z++)
// keep the second source layer inside the parent level
611 if (layer1 >= texture->mipmap[i-1][4])
612 layer1 = texture->mipmap[i-1][4]-1;
613 for (y = 0;y < texture->mipmap[i][3];y++)
// keep the second source row inside the parent level
617 if (row1 >= texture->mipmap[i-1][3])
618 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the source rows in level i-1
// for the (layer0/layer1) x (row0/row1) combinations
619 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
620 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
621 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
622 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
623 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
624 w = texture->mipmap[i][2];
627 if (texture->mipmap[i-1][2] > 1)
629 // average 3D texture
// eight source pixels per output pixel
630 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
633 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
634 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
635 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
640 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop; i2 and i3 do not
641 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
644 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
645 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
646 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
652 if (texture->mipmap[i-1][2] > 1)
654 // average 2D texture (common case)
655 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
658 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
659 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
660 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
665 // 2D texture with parent width == 1
666 o[0] = (i0[0] + i1[0] + 1) >> 1;
667 o[1] = (i0[1] + i1[1] + 1) >> 1;
668 o[2] = (i0[2] + i1[2] + 1) >> 1;
669 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a rectangle of 4-byte pixels into a texture and regenerate its mipmaps.
// pixels holds blockheight rows of blockwidth pixels, tightly packed; the
// rectangle is written at (blockx, blocky).  Does nothing for an invalid index.
// NOTE(review): the mip parameter is not referenced in the lines shown here;
// the destination address and row pitch always use mip level 0.
676 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 DPSOFTRAST_Texture *texture;
680 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel inside the base level
683 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
684 while (blockheight > 0)
// copy one row, then advance the source by the block pitch and the
// destination by the texture pitch
686 memcpy(dst, pixels, blockwidth * 4);
687 pixels += blockwidth * 4;
688 dst += texture->mipmap[0][2] * 4;
691 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the whole base level of a texture: copies mipmap[0][1] bytes (the
// byte size of level 0) from pixels, then regenerates the mipmaps.
// Does nothing for an invalid index.
693 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 DPSOFTRAST_Texture *texture;
696 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
699 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
700 DPSOFTRAST_Texture_CalculateMipmaps(index);
702 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 DPSOFTRAST_Texture *texture;
705 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
706 return texture->mipmap[mip][2];
708 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 DPSOFTRAST_Texture *texture;
711 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
712 return texture->mipmap[mip][3];
714 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 DPSOFTRAST_Texture *texture;
717 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
718 return texture->mipmap[mip][4];
// Return a pointer to the first byte of mip level mip inside the texture's
// pixel storage, or 0 when index does not name a live texture.
// NOTE(review): mip is used as an array index without a range check.
720 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 DPSOFTRAST_Texture *texture;
723 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level
726 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of a texture.  Does nothing for an invalid index.
// Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created
// without DPSOFTRAST_TEXTURE_FLAG_MIPMAP store an error message in
// dpsoftrast.errorstring.
728 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 DPSOFTRAST_Texture *texture;
731 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
739 texture->filter = filter;
// Select the framebuffer to render into: its size, the depth buffer and up to
// four color buffers.  The global state is only rewritten when at least one
// of the values differs from the current one.
742 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
745 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
746 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748 dpsoftrast.fb_width = width;
749 dpsoftrast.fb_height = height;
750 dpsoftrast.fb_depthpixels = depthpixels;
751 dpsoftrast.fb_colorpixels[0] = colorpixels0;
752 dpsoftrast.fb_colorpixels[1] = colorpixels1;
753 dpsoftrast.fb_colorpixels[2] = colorpixels2;
754 dpsoftrast.fb_colorpixels[3] = colorpixels3;
757 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publish the commands queued so far by copying the pool's freecommand
// offset into dpsoftrast.drawcommand.
759 static void DPSOFTRAST_Draw_SyncCommands(void)
762 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make room for space bytes in the command pool.
// The visible lines publish the queued commands, measure how far each thread
// lags behind the write position, and wait on a lagging thread through its
// mutex and condition variables until the pool usage is low enough.
765 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
768 DPSOFTRAST_State_Thread *thread;
770 int freecommand = dpsoftrast.commandpool.freecommand;
771 int usedcommands = dpsoftrast.commandpool.usedcommands;
772 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
774 DPSOFTRAST_Draw_SyncCommands();
780 for (i = 0; i < dpsoftrast.numthreads; i++)
782 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the write position, with wraparound
783 commandoffset = freecommand - thread->commandoffset;
784 if (commandoffset < 0)
785 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786 if (commandoffset > usedcommands)
789 usedcommands = commandoffset;
792 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
794 thread = &dpsoftrast.threads[waitindex];
795 SDL_LockMutex(thread->drawmutex);
// only wait when the thread has not yet reached the published position
796 if (thread->commandoffset != dpsoftrast.drawcommand)
798 thread->waiting = true;
799 if (thread->starving) SDL_CondSignal(thread->drawcond);
800 SDL_CondWait(thread->waitcond, thread->drawmutex);
801 thread->waiting = false;
803 SDL_UnlockMutex(thread->drawmutex);
805 dpsoftrast.commandpool.usedcommands = usedcommands;
807 DPSOFTRAST_Draw_FlushThreads();
// Round size up to the next multiple of COMMAND_SIZE (a power of two).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocate a command of type DPSOFTRAST_Command_<name> from the command pool,
// with its opcode set to DPSOFTRAST_OPCODE_<name> and its size padded to the
// command alignment.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve size bytes in the command pool for a command with the given opcode
// and fill in its opcode/commandsize header.
// When the command does not fit between the write position and the end of
// the pool, a DPSOFTRAST_OPCODE_Reset header is written at the write position
// and the remaining bytes are counted as used.
816 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
818 DPSOFTRAST_Command *command;
819 int freecommand = dpsoftrast.commandpool.freecommand;
820 int usedcommands = dpsoftrast.commandpool.usedcommands;
// besides the command itself, keep room for one header and, if the command
// has to wrap, for the unused tail of the pool
821 int extra = sizeof(DPSOFTRAST_Command);
822 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
823 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: try to free some and reload the pool counters
824 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
826 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
827 freecommand = dpsoftrast.commandpool.freecommand;
828 usedcommands = dpsoftrast.commandpool.usedcommands;
830 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
832 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833 command->opcode = DPSOFTRAST_OPCODE_Reset;
834 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
837 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838 command->opcode = opcode;
839 command->commandsize = size;
841 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
843 dpsoftrast.commandpool.freecommand = freecommand;
844 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Give size bytes back to the command pool by reducing the usedcommands
// counter; freecommand is read and stored back as well.
848 static void DPSOFTRAST_UndoCommand(int size)
850 int freecommand = dpsoftrast.commandpool.freecommand;
851 int usedcommands = dpsoftrast.commandpool.usedcommands;
853 usedcommands -= size;
854 dpsoftrast.commandpool.freecommand = freecommand;
855 dpsoftrast.commandpool.usedcommands = usedcommands;
858 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
859 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
861 thread->viewport[0] = command->x;
862 thread->viewport[1] = command->y;
863 thread->viewport[2] = command->width;
864 thread->viewport[3] = command->height;
865 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queue a Viewport command and update the global copy of the viewport,
// including the global viewport center/scale derived from it.
867 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
869 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872 command->width = width;
873 command->height = height;
875 dpsoftrast.viewport[0] = x;
876 dpsoftrast.viewport[1] = y;
877 dpsoftrast.viewport[2] = width;
878 dpsoftrast.viewport[3] = height;
879 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): carries the clear color as four floats.
882 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Apply a ClearColor command on one thread: walks the clear rectangle of each
// bound color buffer, using the packed BGRA8 form of the command's color.
883 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
885 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
// make sure fb_clearscissor is current
888 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
889 x1 = thread->fb_clearscissor[0];
890 y1 = thread->fb_clearscissor[1];
891 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
892 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2 is the band of framebuffer rows obtained by dividing fb_height
// evenly by thread index
893 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
894 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
901 // FIXME: honor fb_colormask?
902 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
903 for (i = 0;i < 4;i++)
905 if (!dpsoftrast.fb_colorpixels[i])
907 for (y = y1;y < y2;y++)
// p: start of row y in color buffer i
909 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
910 for (x = x1;x < x2;x++)
// Queue a ClearColor command.
915 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
917 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): carries the clear depth as a float.
924 DEFCOMMAND(3, ClearDepth, float depth;)
// Apply a ClearDepth command on one thread: walks the clear rectangle of the
// depth buffer, using the integer form of the command's depth value.
925 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
927 int x1, y1, x2, y2, w, h, x, y, t1, t2;
// make sure fb_clearscissor is current
930 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
931 x1 = thread->fb_clearscissor[0];
932 y1 = thread->fb_clearscissor[1];
933 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
934 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2 is the band of framebuffer rows obtained by dividing fb_height
// evenly by thread index
935 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
936 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
943 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
944 for (y = y1;y < y2;y++)
// p: start of row y in the depth buffer
946 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
947 for (x = x1;x < x2;x++)
// Queue a ClearDepth command.
951 void DPSOFTRAST_ClearDepth(float d)
953 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
957 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
958 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
960 thread->colormask[0] = command->r != 0;
961 thread->colormask[1] = command->g != 0;
962 thread->colormask[2] = command->b != 0;
963 thread->colormask[3] = command->a != 0;
964 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queue a ColorMask command.
966 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
968 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
975 DEFCOMMAND(5, DepthTest, int enable;)
976 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
978 thread->depthtest = command->enable;
979 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
981 void DPSOFTRAST_DepthTest(int enable)
983 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
984 command->enable = enable;
987 DEFCOMMAND(6, ScissorTest, int enable;)
988 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
990 thread->scissortest = command->enable;
991 thread->validate |= DPSOFTRAST_VALIDATE_FB;
993 void DPSOFTRAST_ScissorTest(int enable)
995 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
996 command->enable = enable;
999 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1000 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1002 thread->scissor[0] = command->x;
1003 thread->scissor[1] = command->y;
1004 thread->scissor[2] = command->width;
1005 thread->scissor[3] = command->height;
1006 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queue a Scissor command carrying the given rectangle.
1008 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1010 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1013 command->width = width;
1014 command->height = height;
1017 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1018 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1020 thread->blendfunc[0] = command->sfactor;
1021 thread->blendfunc[1] = command->dfactor;
1022 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1024 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1026 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1027 command->sfactor = sfactor;
1028 command->dfactor = dfactor;
1031 DEFCOMMAND(9, BlendSubtract, int enable;)
1032 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1034 thread->blendsubtract = command->enable;
1035 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1037 void DPSOFTRAST_BlendSubtract(int enable)
1039 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1040 command->enable = enable;
1043 DEFCOMMAND(10, DepthMask, int enable;)
1044 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1046 thread->depthmask = command->enable;
1048 void DPSOFTRAST_DepthMask(int enable)
1050 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1051 command->enable = enable;
1054 DEFCOMMAND(11, DepthFunc, int func;)
1055 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1057 thread->depthfunc = command->func;
1059 void DPSOFTRAST_DepthFunc(int func)
1061 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1062 command->func = func;
1065 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1066 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1068 thread->depthrange[0] = command->nearval;
1069 thread->depthrange[1] = command->farval;
1071 void DPSOFTRAST_DepthRange(float nearval, float farval)
1073 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1074 command->nearval = nearval;
1075 command->farval = farval;
1078 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1079 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1081 thread->polygonoffset[0] = command->alongnormal;
1082 thread->polygonoffset[1] = command->intoview;
1084 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1086 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1087 command->alongnormal = alongnormal;
1088 command->intoview = intoview;
1091 DEFCOMMAND(14, CullFace, int mode;)
1092 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1094 thread->cullface = command->mode;
1096 void DPSOFTRAST_CullFace(int mode)
1098 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1099 command->mode = mode;
1102 DEFCOMMAND(15, AlphaTest, int enable;)
1103 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1105 thread->alphatest = command->enable;
1107 void DPSOFTRAST_AlphaTest(int enable)
1109 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1110 command->enable = enable;
1113 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1114 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1116 thread->alphafunc = command->func;
1117 thread->alphavalue = command->ref;
// Public setter: obtains an AlphaFunc command record via
// DPSOFTRAST_ALLOCATECOMMAND and stores `func` in it.
// NOTE(review): no line visible here stores `ref` into the command, yet
// DPSOFTRAST_Interpret_AlphaFunc reads command->ref. The embedded numbering
// skips 1123, so the store is presumably just absent from this listing --
// confirm against the full file.
1119 void DPSOFTRAST_AlphaFunc(int func, float ref)
1121 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1122 command->func = func;
1126 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1128 dpsoftrast.color[0] = r;
1129 dpsoftrast.color[1] = g;
1130 dpsoftrast.color[2] = b;
1131 dpsoftrast.color[3] = a;
// Reads a rectangular block of the color framebuffer (fb_colorpixels[0]) into
// outpixels, 4 bytes per pixel.
//   blockx, blocky            - first column/row of the requested block
//   blockwidth, blockheight   - requested block size in pixels
//   outpixels                 - destination; rows are blockwidth*4 bytes apart
// The block is clamped to the framebuffer bounds before copying.
// NOTE(review): several declarations and the inner copy statements are not
// visible in this listing (embedded numbering has gaps); comments below only
// describe the visible lines.
1134 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Row pitches in bytes for destination and framebuffer.
1136 int outstride = blockwidth * 4;
1137 int instride = dpsoftrast.fb_width * 4;
// Exclusive right/bottom edges of the requested block.
1140 int bx2 = blockx + blockwidth;
1141 int by2 = blocky + blockheight;
1146 unsigned char *inpixels;
// Clamp the block to [0, fb_width) x [0, fb_height).
1150 if (bx1 < 0) bx1 = 0;
1151 if (by1 < 0) by1 = 0;
1152 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1153 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1156 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Big-endian hosts take a per-pixel path (inner loop body not visible here).
1157 if (dpsoftrast.bigendian)
1159 for (y = by1;y < by2;y++)
// Source row is (fb_height - 1 - y): the framebuffer is read bottom-up
// relative to y. Destination row is relative to the clamped top (by1).
1161 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1162 o = (unsigned char *)outpixels + (y - by1) * outstride;
1163 for (x = bx1;x < bx2;x++)
// Second row loop: same addressing as above; presumably the non-big-endian
// path (the `else` and its copy statement are not visible) -- confirm.
1176 for (y = by1;y < by2;y++)
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer
// (fb_colorpixels[0]) at (sx, sy) into mip level `mip` of texture `index` at
// (tx, ty). Both rectangles are clamped to their surfaces and the copy size is
// the smaller of the two. Returns without copying when the texture handle or
// mip level is invalid.
// NOTE(review): several declarations and assignments (tx1/ty1/sx1/sy1, tw/th,
// sw/sh, the statement after the size check) are not visible in this listing;
// comments describe only the visible lines.
1185 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// Exclusive right/bottom edges of the destination and source rectangles.
1189 int tx2 = tx + width;
1190 int ty2 = ty + height;
1193 int sx2 = sx + width;
1194 int sy2 = sy + height;
1204 unsigned int *spixels;
1205 unsigned int *tpixels;
1206 DPSOFTRAST_Texture *texture;
// Reject unknown texture handles and out-of-range mip levels.
1207 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1208 if (mip < 0 || mip >= texture->mipmaps) return;
// Source surface: the color framebuffer, addressed as 32-bit pixels.
1211 spixels = dpsoftrast.fb_colorpixels[0];
1212 swidth = dpsoftrast.fb_width;
1213 sheight = dpsoftrast.fb_height;
// Destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] as the level width and [3] as the level height.
1214 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1215 twidth = texture->mipmap[mip][2];
1216 theight = texture->mipmap[mip][3];
// Clamp both rectangles to their surfaces.
1217 if (tx1 < 0) tx1 = 0;
1218 if (ty1 < 0) ty1 = 0;
1219 if (tx2 > twidth) tx2 = twidth;
1220 if (ty2 > theight) ty2 = theight;
1221 if (sx1 < 0) sx1 = 0;
1222 if (sy1 < 0) sy1 = 0;
1223 if (sx2 > swidth) sx2 = swidth;
1224 if (sy2 > sheight) sy2 = sheight;
// Copy only the area that fits in both rectangles.
1229 if (tw > sw) tw = sw;
1230 if (th > sh) th = sh;
// Empty intersection; presumably followed by a return on a line not visible
// here -- confirm.
1231 if (tw < 1 || th < 1)
// Row-by-row copy, tw pixels (tw*4 bytes) per row.
1233 for (y = 0;y < th;y++)
1234 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// Rebuild the mip chain when the texture has more than one level.
1235 if (texture->mipmaps > 1)
1236 DPSOFTRAST_Texture_CalculateMipmaps(index);
1239 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1240 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1242 if (thread->texbound[command->unitnum])
1243 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1244 thread->texbound[command->unitnum] = command->texture;
1246 void DPSOFTRAST_SetTexture(int unitnum, int index)
1248 DPSOFTRAST_Command_SetTexture *command;
1249 DPSOFTRAST_Texture *texture;
1250 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1252 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1255 texture = DPSOFTRAST_Texture_GetByIndex(index);
1256 if (index && !texture)
1258 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1262 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1263 command->unitnum = unitnum;
1264 command->texture = texture;
1266 dpsoftrast.texbound[unitnum] = texture;
1267 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1270 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1272 dpsoftrast.pointer_vertex3f = vertex3f;
1273 dpsoftrast.stride_vertex = stride;
1275 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1277 dpsoftrast.pointer_color4f = color4f;
1278 dpsoftrast.pointer_color4ub = NULL;
1279 dpsoftrast.stride_color = stride;
1281 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1283 dpsoftrast.pointer_color4f = NULL;
1284 dpsoftrast.pointer_color4ub = color4ub;
1285 dpsoftrast.stride_color = stride;
1287 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1289 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1290 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1291 dpsoftrast.stride_texcoord[unitnum] = stride;
1294 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1295 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1297 thread->shader_mode = command->mode;
1298 thread->shader_permutation = command->permutation;
1300 void DPSOFTRAST_SetShader(int mode, int permutation)
1302 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1303 command->mode = mode;
1304 command->permutation = permutation;
1306 dpsoftrast.shader_mode = mode;
1307 dpsoftrast.shader_permutation = permutation;
1310 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1311 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1313 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1315 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1317 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1318 command->index = index;
1319 command->val[0] = v0;
1320 command->val[1] = v1;
1321 command->val[2] = v2;
1322 command->val[3] = v3;
1324 dpsoftrast.uniform4f[index*4+0] = v0;
1325 dpsoftrast.uniform4f[index*4+1] = v1;
1326 dpsoftrast.uniform4f[index*4+2] = v2;
1327 dpsoftrast.uniform4f[index*4+3] = v3;
1329 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1331 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1332 command->index = index;
1333 memcpy(command->val, v, sizeof(command->val));
1335 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1338 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1339 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1341 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices starting at uniform
// `uniform`. For each matrix one UniformMatrix4f command is filled and the
// same sixteen floats are mirrored into dpsoftrast.uniform4f.
//   uniform   - first uniform index; advances by 4 per matrix
//   arraysize - number of matrices read from v (16 floats each)
//   transpose - NOTE(review): the line that tests this flag is not visible in
//               this listing; presumably it guards the transpose sequence
//               below -- confirm.
//   v         - source floats; may be unaligned
1343 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1347 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1349 __m128 m0, m1, m2, m3;
1350 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1351 command->index = index;
// Pick unaligned or aligned loads depending on the source address.
1352 if (((size_t)v)&(ALIGN_SIZE-1))
1354 m0 = _mm_loadu_ps(v);
1355 m1 = _mm_loadu_ps(v+4);
1356 m2 = _mm_loadu_ps(v+8);
1357 m3 = _mm_loadu_ps(v+12);
1361 m0 = _mm_load_ps(v);
1362 m1 = _mm_load_ps(v+4);
1363 m2 = _mm_load_ps(v+8);
1364 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the halves so that
// m0..m3 end up holding what were columns 0..3.
1368 __m128 t0, t1, t2, t3;
1369 t0 = _mm_unpacklo_ps(m0, m1);
1370 t1 = _mm_unpacklo_ps(m2, m3);
1371 t2 = _mm_unpackhi_ps(m0, m1);
1372 t3 = _mm_unpackhi_ps(m2, m3);
1373 m0 = _mm_movelh_ps(t0, t1);
1374 m1 = _mm_movehl_ps(t1, t0);
1375 m2 = _mm_movelh_ps(t2, t3);
1376 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores: into the command record ...
1378 _mm_store_ps(command->val, m0);
1379 _mm_store_ps(command->val+4, m1);
1380 _mm_store_ps(command->val+8, m2);
1381 _mm_store_ps(command->val+12, m3);
// ... and into the global mirror of the uniform table.
1382 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1383 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1384 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1385 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1390 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1391 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1393 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: fills a Uniform1i command record and mirrors the
// value into dpsoftrast.uniform1i.
// NOTE(review): no line visible here stores i0 into command->val, yet
// DPSOFTRAST_Interpret_Uniform1i reads it. The embedded numbering skips 1399,
// so the store is presumably just absent from this listing -- confirm.
1395 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1397 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1398 command->index = index;
// Global mirror of the value, indexed through the command's copy of index.
1401 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float items from a strided source into a packed destination.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - number of items (dst receives size*4 floats)
//   stride - source stride; tested together with the address for alignment
// NOTE(review): the loops and pointer advances are not visible in this
// listing; only the two load/store variants are.
1405 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1407 float *end = dst + size*4;
// Unaligned path if either the address or the stride is not a multiple of
// ALIGN_SIZE (a misaligned stride would misalign later items).
1408 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1412 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// Aligned path.
1421 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float items into 4-float items, writing 1.0f as the fourth
// component of each output.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - number of items
//   stride - source stride in bytes; sizeof(float[3]) selects the packed path
// NOTE(review): the loops, `else` lines and some pointer advances are not
// visible in this listing; comments describe the visible statements.
1428 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1430 float *end = dst + size*4;
// Tightly packed source: four items (48 bytes = three 16-byte loads) are
// converted per step, up to the last multiple of four items (end4).
1431 if (stride == sizeof(float[3]))
1433 float *end4 = dst + (size&~3)*4;
1434 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3 (unaligned loads).
1438 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// Item 0: rotate so the spare lane is lane 0, overwrite it with 1.0f, then
// rotate back so 1.0f lands in lane 3.
1439 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1440 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1441 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// Item 1: gather x1 from v1 and y1 z1 from v2 behind a spare lane 0.
1442 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1443 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1444 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// Item 2: gather x2 y2 from v2 and z2 from v3, rotate to free lane 0.
1445 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1446 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1447 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1448 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// Item 3: v3 already holds x3 y3 z3 in lanes 1..3; replace lane 0 and rotate.
1449 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1450 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452 src += 4*sizeof(float[3]);
// Same conversion with aligned loads.
1459 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1460 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1461 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1462 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1463 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1467 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1468 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1469 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1471 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473 src += 4*sizeof(float[3]);
// One-item-at-a-time path (presumably the remainder of the packed case and
// all non-packed strides -- confirm against the hidden control lines).
// NOTE(review): each step loads 16 bytes but uses only the first three
// floats, so the load for the final item reads 4 bytes beyond it unless the
// caller's buffer is padded -- confirm.
1477 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1481 __m128 v = _mm_loadu_ps((const float *)src);
1482 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1483 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1484 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1485 _mm_store_ps(dst, v);
1494 __m128 v = _mm_load_ps((const float *)src);
1495 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1496 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1497 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1498 _mm_store_ps(dst, v);
// Expands 2-float items into 4-float items; the third and fourth components
// of each output are 0.0f and 1.0f.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - number of items
//   stride - source stride in bytes; sizeof(float[2]) selects the packed path
// NOTE(review): loops, `else` lines and some pointer advances are not visible
// in this listing.
1505 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1507 float *end = dst + size*4;
// Supplies lanes 2 and 3 (0, 1) of every output.
1508 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed source: two items per 16-byte load, up to the last even
// item count (end2).
1509 if (stride == sizeof(float[2]))
1511 float *end2 = dst + (size&~1)*4;
1512 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1516 __m128 v = _mm_loadu_ps((const float *)src);
// First item: low half of v followed by (0, 1).
1517 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// Second item: high half of v followed by (0, 1).
1518 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1520 src += 2*sizeof(float[2]);
// Same conversion with an aligned load.
1527 __m128 v = _mm_load_ps((const float *)src);
1528 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1529 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1531 src += 2*sizeof(float[2]);
// Single-item path: an 8-byte load replaces the low half of v2.
1537 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte items into 4-float items, scaling each byte by 1/255.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - number of items
//   stride - source stride in bytes; 4 selects the packed path
// NOTE(review): loops, `else` lines and some pointer advances are not visible
// in this listing.
1543 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1545 float *end = dst + size*4;
1546 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed source: four items (16 bytes) per load, up to the last
// multiple of four items (end4).
1547 if (stride == sizeof(unsigned char[4]))
1549 float *end4 = dst + (size&~3)*4;
1550 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Widen bytes to 16-bit (v1 = items 0-1, v2 = items 2-3), then each half to
// 32-bit, convert to float and scale.
1554 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1555 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1556 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1557 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1558 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1560 src += 4*sizeof(unsigned char[4]);
// Same conversion with an aligned load.
1567 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1568 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1569 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1570 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1571 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1573 src += 4*sizeof(unsigned char[4]);
// Single-item path: reads the four bytes as one int through a cast pointer
// (no alignment check on this read), then widens, converts and scales.
1579 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1580 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of a single 4-float value.
//   dst  - destination, written with aligned 16-byte stores
//   src  - the four floats to replicate (loaded once, unaligned)
//   size - number of 4-float items
// NOTE(review): the loop around the store is not visible in this listing.
1586 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1588 float *end = dst + 4*size;
1589 __m128 v = _mm_loadu_ps(src);
1592 _mm_store_ps(dst, v);
// Multiplies `numitems` 4-float vectors by a 4x4 matrix.
//   out4f       - destination vectors (may equal in4f)
//   in4f        - source vectors
//   numitems    - number of vectors
//   inmatrix16f - sixteen floats; each group of four is combined as
//                 v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3
// An exact bitwise identity matrix is detected up front and handled with a
// plain copy.
// NOTE(review): the loops, the stores of each result and the return after the
// fast path are not visible in this listing.
1598 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1601 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1602 __m128 m0, m1, m2, m3;
1604 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1606 // fast case for identity matrix
// In-place calls skip the copy entirely.
1607 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1610 end = out4f + numitems*4;
// The matrix is always loaded unaligned.
1611 m0 = _mm_loadu_ps(inmatrix16f);
1612 m1 = _mm_loadu_ps(inmatrix16f + 4);
1613 m2 = _mm_loadu_ps(inmatrix16f + 8);
1614 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Only the source address selects between the two loops below.
1615 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1619 __m128 v = _mm_loadu_ps(in4f);
// Broadcast each component of v and accumulate component * matrix row.
1621 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1622 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1623 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1624 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Same arithmetic with an aligned load.
1633 __m128 v = _mm_load_ps(in4f);
1635 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1636 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1637 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1638 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f.
//   out4f    - destination; must have room for numitems*4 floats
//   in4f     - source vectors
//   numitems - number of vectors; values <= 0 copy nothing
// An in-place call (out4f == in4f) is a no-op: memcpy must not be given
// overlapping regions, and DPSOFTRAST_Vertex_Transform's identity fast path
// applies the same guard.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// A negative count would convert to a huge size_t in the multiplication.
	if (numitems <= 0)
		return;
	if (out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective-divides a 4-float vector and applies a per-lane scale/offset.
// With in = (x, y, z, w) the result is
//   out[0] = center[1] + scale[1] * x / w
//   out[1] = center[2] + scale[2] * y / w
//   out[2] = center[3] + scale[3] * z / w
//   out[3] = center[0] + scale[0] * 1 / w
// i.e. viewportcenter/viewportscale keep the 1/w term in lane 0. The lanes are
// rotated so w's slot sits in lane 0 (where _mm_move_ss can replace it with
// 1.0f) and rotated back at the end. `in` is evaluated once; viewportcenter
// and viewportscale are used as given.
1652 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1654 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1655 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1656 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1657 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same arithmetic as DPSOFTRAST_PROJECTVERTEX in the lines visible here:
// divide (1, x, y, z) by w, scale and offset per lane, then rotate the result
// back to (x, y, z, 1/w) order.
// NOTE(review): the visible body is identical to DPSOFTRAST_PROJECTVERTEX and
// no use of this macro appears in the visible part of the file -- confirm
// whether it is still needed.
1660 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1662 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1663 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1664 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1665 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Computes out = p[0]*m0 + p[1]*m1 + p[2]*m2 + p[3]*m3, broadcasting each
// component of p across all four lanes before multiplying.
// NOTE(review): the line that defines `p` is not visible in this listing;
// presumably it copies `in` into a local -- confirm.
1668 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1671 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1672 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1673 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1674 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Estimates the screen-space Y range covered by an axis-aligned bounding box
// after transformation by m0..m3, and reports which box corners fail the
// z + w >= 0 test.
//   starty, endy   - receive the integer row range derived from the projected
//                    extremes
//   minpos, maxpos - opposite corners of the box
//   viewportcenter, viewportscale - per-lane offset/scale; lane 2 is the one
//                    applied to the projected value here
//   m0..m3         - matrix rows, combined as in DPSOFTRAST_TRANSFORMVERTEX
// clipmask starts with all eight corner bits set; a bit is cleared for each
// corner that passes the test.
// NOTE(review): several lines are not visible in this listing (BBFRONT(0) and
// BBFRONT(7), the #define of BBCLIP, the declaration of `proj` inside
// BBFRONT, and the return statement); presumably clipmask is returned --
// confirm.
1677 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1679 int clipmask = 0xFF;
// minproj/maxproj start beyond the clamp range used at the end (+2 / -2).
1680 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// Swap lanes 0 and 1 of every matrix row so the transformed Y lands in
// lane 0, where the scalar (_ss) operations below work.
1681 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1682 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1683 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1684 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is
// non-negative clear the corner's clip bit and fold y/w into the extremes.
1685 #define BBFRONT(k, pos) \
1687 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1688 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1689 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1692 clipmask &= ~(1<<k); \
1693 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1694 minproj = _mm_min_ss(minproj, proj); \
1695 maxproj = _mm_max_ss(maxproj, proj); \
// Corners are built by mixing lanes of minpos and maxpos.
1699 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1700 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1701 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1702 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1703 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1704 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a corner that failed the test, walk its three box edges
// (neighbours k^1, k^2, k^4). Where the neighbour passed, interpolate along
// the edge to the point where clipdist reaches zero and fold that point's
// projected value into the extremes.
1708 if (clipmask&(1<<k)) \
1710 if (!(clipmask&(1<<(k^1)))) \
1712 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1713 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1714 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1715 minproj = _mm_min_ss(minproj, proj); \
1716 maxproj = _mm_max_ss(maxproj, proj); \
1718 if (!(clipmask&(1<<(k^2)))) \
1720 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1721 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1722 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1723 minproj = _mm_min_ss(minproj, proj); \
1724 maxproj = _mm_max_ss(maxproj, proj); \
1726 if (!(clipmask&(1<<(k^4)))) \
1728 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1729 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1730 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1731 minproj = _mm_min_ss(minproj, proj); \
1732 maxproj = _mm_max_ss(maxproj, proj); \
1736 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// Bring lane 2 of the viewport terms into lane 0 for the scalar math below.
1737 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1738 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// Clamp the projected extremes to [-2, 2] before converting to rows.
1739 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1740 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1741 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1742 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (+1 after truncation);
// presumably the Y scale is negative so the larger projected value maps to
// the smaller row -- confirm.
1743 *starty = _mm_cvttss_si32(maxproj);
1744 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed 4-float positions to screen space.
//   out4f    - receives an unmodified copy of each input position
//   screen4f - receives each position after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - row range, filled in by DPSOFTRAST_Vertex_BoundY
//   in4f     - source positions
//   numitems - number of positions
// Returns whatever DPSOFTRAST_Vertex_BoundY returns for the bounding box of
// the inputs, evaluated with an identity matrix.
// NOTE(review): the loops and pointer advances are not visible in this
// listing. The first position is loaded before any visible loop, so the
// visible code reads in4f even for an empty batch -- confirm callers never
// pass numitems == 0.
1749 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1752 float *end = out4f + numitems*4;
1753 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1754 __m128 minpos, maxpos;
1755 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounding box with the first position (unaligned variant).
1757 minpos = maxpos = _mm_loadu_ps(in4f);
1760 __m128 v = _mm_loadu_ps(in4f);
// Grow the bounding box, keep the clip-space copy, then project.
1761 minpos = _mm_min_ps(minpos, v);
1762 maxpos = _mm_max_ps(maxpos, v);
1763 _mm_store_ps(out4f, v);
1764 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1765 _mm_store_ps(screen4f, v);
// Aligned variant of the same steps.
1773 minpos = maxpos = _mm_load_ps(in4f);
1776 __m128 v = _mm_load_ps(in4f);
1777 minpos = _mm_min_ps(minpos, v);
1778 maxpos = _mm_max_ps(maxpos, v);
1779 _mm_store_ps(out4f, v);
1780 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1781 _mm_store_ps(screen4f, v);
// Positions are already transformed, so the box is bounded with identity rows.
1788 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1789 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1790 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1791 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1792 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` 4-float positions by a 4x4 matrix and projects them
// to screen space.
//   out4f       - receives each transformed position
//   screen4f    - receives each transformed position after
//                 DPSOFTRAST_PROJECTVERTEX
//   starty, endy - row range, filled in by DPSOFTRAST_Vertex_BoundY
//   in4f        - source positions
//   numitems    - number of positions
//   inmatrix16f - sixteen floats, loaded as four rows m0..m3
// An exact bitwise identity matrix is forwarded to DPSOFTRAST_Vertex_Project.
// Returns the result of DPSOFTRAST_Vertex_BoundY for the bounding box of the
// untransformed inputs and the same matrix.
// NOTE(review): the loops and pointer advances are not visible in this
// listing.
1797 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1800 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1801 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1803 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1804 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1805 end = out4f + numitems*4;
1806 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1807 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// The matrix is always loaded unaligned.
1808 m0 = _mm_loadu_ps(inmatrix16f);
1809 m1 = _mm_loadu_ps(inmatrix16f + 4);
1810 m2 = _mm_loadu_ps(inmatrix16f + 8);
1811 m3 = _mm_loadu_ps(inmatrix16f + 12);
1812 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounding box with the first input position (unaligned variant).
1814 minpos = maxpos = _mm_loadu_ps(in4f);
1817 __m128 v = _mm_loadu_ps(in4f);
// The box is grown from the input position, before the transform is applied.
1818 minpos = _mm_min_ps(minpos, v);
1819 maxpos = _mm_max_ps(maxpos, v);
1820 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1821 _mm_store_ps(out4f, v);
1822 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1823 _mm_store_ps(screen4f, v);
// Aligned variant of the same steps.
1831 minpos = maxpos = _mm_load_ps(in4f);
1834 __m128 v = _mm_load_ps(in4f);
1835 minpos = _mm_min_ps(minpos, v);
1836 maxpos = _mm_max_ps(maxpos, v);
1837 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1838 _mm_store_ps(out4f, v);
1839 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1840 _mm_store_ps(screen4f, v);
// BoundY transforms the box corners itself, hence the untransformed box.
1847 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts one client vertex array into packed 4-float form in
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   outarray - index of the destination post_array4f slot
//   inarray  - which source array to read (DPSOFTRAST_ARRAY_* value)
// NOTE(review): the switch line, several case labels, breaks, else lines and
// the return statement are not visible in this listing; presumably the
// function returns outf -- confirm.
1852 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1854 float *outf = dpsoftrast.post_array4f[outarray];
1855 const unsigned char *inb;
1856 int firstvertex = dpsoftrast.firstvertex;
1857 int numvertices = dpsoftrast.numvertices;
// Positions: three floats per vertex, expanded to four.
1861 case DPSOFTRAST_ARRAY_POSITION:
1862 stride = dpsoftrast.stride_vertex;
1863 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1864 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// Colors: float array if set, else byte array if set; the constant-color
// fill below is presumably the fallback when neither is set -- confirm.
1866 case DPSOFTRAST_ARRAY_COLOR:
1867 stride = dpsoftrast.stride_color;
1868 if (dpsoftrast.pointer_color4f)
1870 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1871 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1873 else if (dpsoftrast.pointer_color4ub)
// Same value as assigned at the top of this case.
1875 stride = dpsoftrast.stride_color;
1876 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1877 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1881 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texture coordinates: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and
// the loader is chosen by the recorded component count.
1885 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1886 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1888 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1889 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1892 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1895 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1898 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a post array in place by inmatrix16f.
//   outarray    - post_array4f slot to transform
//   inarray     - source array to load into that slot first; a negative value
//                 skips the load and uses the slot's current contents
//   inmatrix16f - matrix passed to DPSOFTRAST_Vertex_Transform
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned -- confirm.
1907 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1909 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1910 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a post array to screen space: writes dpsoftrast.screencoord4f,
// drawstarty and drawendy, and stores DPSOFTRAST_Vertex_Project's result in
// dpsoftrast.drawclipped.
//   outarray - post_array4f slot holding the positions
//   inarray  - source array to load into that slot first; a negative value
//              skips the load
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned -- confirm.
1915 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1917 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1918 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a post array in place by inmatrix16f and projects it to screen
// space: writes dpsoftrast.screencoord4f, drawstarty and drawendy, and stores
// DPSOFTRAST_Vertex_TransformProject's result in dpsoftrast.drawclipped.
//   outarray    - post_array4f slot holding the positions
//   inarray     - source array to load into that slot first; a negative value
//                 skips the load
//   inmatrix16f - matrix applied before projection
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned -- confirm.
1923 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1925 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1926 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the reciprocal of the interpolated w for the pixels of a span,
// from span->startx up to span->endx.
//   thread   - not referenced in the visible lines
//   triangle - supplies the w plane: w[0] is the per-x slope, w[1] the per-y
//              slope, w[2] the constant term
//   span     - supplies x, y, startx and endx
//   zf       - output buffer (the store into it is not visible here)
// The exact reciprocal is evaluated only at sub-span boundaries, every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and stepped linearly in between.
// NOTE(review): declarations of x, z, dz, the assignment that carries endz
// into z, and the loop body are not visible in this listing.
1930 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1933 int startx = span->startx;
1934 int endx = span->endx;
1935 float wslope = triangle->w[0];
// w at the span origin; x offsets within the span are added as wslope * x.
1936 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1937 float endz = 1.0f / (w + wslope * startx);
1938 for (x = startx;x < endx;)
1940 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// Last sub-span: stop at the final pixel of the span.
1942 if(nextsub >= endx) nextsub = endsub = endx-1;
1943 endz = 1.0f / (w + wslope * nextsub);
// Per-pixel step; zero when the sub-span is a single pixel.
1944 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1945 for (; x <= endsub; x++, z += dz)
// Writes a span of float RGBA colors into the color framebuffer
// (fb_colorpixels[0]), combining with the existing pixels according to
// thread->fb_blendmode.
//   thread   - supplies alphatest and fb_blendmode
//   triangle - not referenced in the visible lines
//   span     - supplies x, y, startx, endx and pixelmask
//   in4f     - four floats per pixel, indexed by x within the span
// Source components 0, 1, 2 are written to framebuffer bytes 2, 1, 0 and
// component 3 to byte 3.
// NOTE(review): many short lines are not visible in this listing
// (declarations of x, a, b, d, any per-pixel pixelmask test, breaks); the
// comments describe only the visible arithmetic.
// NOTE(review): except in SUBALPHA, results are clamped only against 255, so
// a negative intermediate would be stored unclamped -- confirm inputs are
// never negative.
1950 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1953 int startx = span->startx;
1954 int endx = span->endx;
1957 unsigned char * RESTRICT pixelmask = span->pixelmask;
1958 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Move to the span origin; pixels are 4 bytes each.
1961 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1962 // handle alphatest now (this affects depth writes too)
// The threshold is a fixed 0.5 in this function.
1963 if (thread->alphatest)
1964 for (x = startx;x < endx;x++)
1965 if (in4f[x*4+3] < 0.5f)
1966 pixelmask[x] = false;
1967 // FIXME: this does not handle bigendian
1968 switch(thread->fb_blendmode)
// OPAQUE: result = source * 255.
1970 case DPSOFTRAST_BLENDMODE_OPAQUE:
1971 for (x = startx;x < endx;x++)
1975 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1976 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1977 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1978 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1979 pixel[x*4+0] = d[0];
1980 pixel[x*4+1] = d[1];
1981 pixel[x*4+2] = d[2];
1982 pixel[x*4+3] = d[3];
// ALPHA: result = source * alpha * 255 + destination * (1 - alpha).
1985 case DPSOFTRAST_BLENDMODE_ALPHA:
1986 for (x = startx;x < endx;x++)
1990 a = in4f[x*4+3] * 255.0f;
1991 b = 1.0f - in4f[x*4+3];
1992 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1993 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1994 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1995 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
// ADDALPHA: result = source * alpha * 255 + destination.
2002 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2003 for (x = startx;x < endx;x++)
2007 a = in4f[x*4+3] * 255.0f;
2008 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2009 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2010 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2011 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2012 pixel[x*4+0] = d[0];
2013 pixel[x*4+1] = d[1];
2014 pixel[x*4+2] = d[2];
2015 pixel[x*4+3] = d[3];
// ADD: result = source * 255 + destination.
2018 case DPSOFTRAST_BLENDMODE_ADD:
2019 for (x = startx;x < endx;x++)
2023 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027 pixel[x*4+0] = d[0];
2028 pixel[x*4+1] = d[1];
2029 pixel[x*4+2] = d[2];
2030 pixel[x*4+3] = d[3];
// INVMOD: result = (1 - source) * destination.
2033 case DPSOFTRAST_BLENDMODE_INVMOD:
2034 for (x = startx;x < endx;x++)
2038 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042 pixel[x*4+0] = d[0];
2043 pixel[x*4+1] = d[1];
2044 pixel[x*4+2] = d[2];
2045 pixel[x*4+3] = d[3];
// MUL: result = source * destination.
2048 case DPSOFTRAST_BLENDMODE_MUL:
2049 for (x = startx;x < endx;x++)
2053 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057 pixel[x*4+0] = d[0];
2058 pixel[x*4+1] = d[1];
2059 pixel[x*4+2] = d[2];
2060 pixel[x*4+3] = d[3];
// MUL2: result = source * destination * 2.
2063 case DPSOFTRAST_BLENDMODE_MUL2:
2064 for (x = startx;x < endx;x++)
2068 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2069 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2070 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2071 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2072 pixel[x*4+0] = d[0];
2073 pixel[x*4+1] = d[1];
2074 pixel[x*4+2] = d[2];
2075 pixel[x*4+3] = d[3];
// SUBALPHA: result = destination - source * alpha * 255, clamped at both ends.
2078 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2079 for (x = startx;x < endx;x++)
2083 a = in4f[x*4+3] * -255.0f;
2084 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2085 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2086 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2087 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2088 pixel[x*4+0] = d[0];
2089 pixel[x*4+1] = d[1];
2090 pixel[x*4+2] = d[2];
2091 pixel[x*4+3] = d[3];
// PSEUDOALPHA: result = source * a + destination * (1 - alpha); the line that
// sets `a` for this case is not visible in this listing.
2094 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2095 for (x = startx;x < endx;x++)
2100 b = 1.0f - in4f[x*4+3];
2101 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2102 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2103 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2104 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2105 pixel[x*4+0] = d[0];
2106 pixel[x*4+1] = d[1];
2107 pixel[x*4+2] = d[2];
2108 pixel[x*4+3] = d[3];
// Blends one span of already-shaded BGRA8 pixels (in4ub) into the first color
// buffer, dpsoftrast.fb_colorpixels[0], according to thread->fb_blendmode.
// span->pixelmask has one byte per pixel; the loops below test those bytes
// before storing, and the alpha test can clear more of them first.
2114 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2118 int startx = span->startx;
2119 int endx = span->endx;
// 32-bit view of the input so that a whole pixel can be moved at once
2120 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2121 unsigned char * RESTRICT pixelmask = span->pixelmask;
2122 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2123 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// step both framebuffer views (byte and 32-bit) to pixel (span->x, span->y)
2126 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2127 pixeli += span->y * dpsoftrast.fb_width + span->x;
2128 // handle alphatest now (this affects depth writes too)
2129 if (thread->alphatest)
2130 for (x = startx;x < endx;x++)
// NOTE(review): in4ub[] is unsigned char, so this comparison only rejects
// alpha == 0; confirm whether a mid-range threshold (e.g. 128) was intended.
2131 if (in4ub[x*4+3] < 0.5f)
2132 pixelmask[x] = false;
2133 // FIXME: this does not handle bigendian
2134 switch(thread->fb_blendmode)
2136 case DPSOFTRAST_BLENDMODE_OPAQUE:
// four pixels per step: when all four mask bytes equal 1 (read together as
// one 32-bit word) the pixels are copied with a single unaligned 128-bit store
2137 for (x = startx;x + 4 <= endx;)
2139 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2141 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2155 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND walks the span two pixels at a time and switches on the pair of
// mask bytes read as one 16-bit value.  src and dst are the input and
// framebuffer pixels widened to 16 bits per channel; the visible cases load
// both pixels (64-bit load/store), only pixel x+1, or only pixel x, and a
// trailing loop covers a leftover pixel.  Results are narrowed back to bytes
// with unsigned saturation (_mm_packus_epi16).  blend2 and blend1 are
// presumably the blend statements for the two-pixel and one-pixel forms.
// For ALPHA the blend is dst + (src - dst) * srcalpha / 256, with the source
// alpha (16-bit lane 3 of each pixel) broadcast to all four channels.
2156 #define FINISHBLEND(blend2, blend1) \
2157 for (x = startx;x + 2 <= endx;x += 2) \
2160 switch (*(const unsigned short*)&pixelmask[x]) \
2163 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2164 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2166 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2169 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2170 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2172 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2175 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2176 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2178 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2183 for(;x < endx; x++) \
2186 if (!pixelmask[x]) \
2188 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2189 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2191 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2195 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2196 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2198 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2199 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst + src * srcalpha / 256
2202 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2204 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2205 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2207 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2208 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst + src (clamped to 255 when packed back to bytes)
2211 case DPSOFTRAST_BLENDMODE_ADD:
2212 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst - dst * src / 256
2214 case DPSOFTRAST_BLENDMODE_INVMOD:
2216 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2218 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: src * dst / 256
2221 case DPSOFTRAST_BLENDMODE_MUL:
2222 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: src * dst / 128 (clamped to 255 when packed back to bytes)
2224 case DPSOFTRAST_BLENDMODE_MUL2:
2225 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst - src * srcalpha / 256 (clamped to 0 when packed back to bytes)
2227 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2229 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2230 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2232 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2233 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: src + dst - dst * srcalpha / 256
2236 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2238 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2241 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// Samples the 2D texture bound to texture unit `texunitindex` for pixels
// startx..endx-1 of the span and writes RGBA floats (0..1) to out4f.
// Texture coordinates come from varying `arrayindex`: they are evaluated as
// (data + slope*x) * zf[x] only at subspan boundaries and stepped linearly in
// 16.16 fixed point in between.  Texel bytes are read in the order 2,1,0,3,
// so byte 2 of a texel lands in output channel 0.
2249 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2252 int startx = span->startx;
2253 int endx = span->endx;
2258 float tc[2], endtc[2];
2260 unsigned int tci[2];
2261 unsigned int tci1[2];
2262 unsigned int tcimin[2];
2263 unsigned int tcimax[2];
2268 const unsigned char * RESTRICT pixelbase;
// pixel[0..3] point at the up to four texels blended for one output pixel
2269 const unsigned char * RESTRICT pixel[4];
2270 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2271 // if no texture is bound, just fill it with white
2274 for (x = startx;x < endx;x++)
2276 out4f[x*4+0] = 1.0f;
2277 out4f[x*4+1] = 1.0f;
2278 out4f[x*4+2] = 1.0f;
2279 out4f[x*4+3] = 1.0f;
2283 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of the selected level inside texture->bytes
2284 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2285 // if this mipmap of the texture is 1 pixel, just fill it with that color
2286 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads texture->bytes (the start of the texture data)
// rather than pixelbase, which points at the selected level — confirm that
// the first texel of the whole texture is really what is wanted here.
2288 c[0] = texture->bytes[2] * (1.0f/255.0f);
2289 c[1] = texture->bytes[1] * (1.0f/255.0f);
2290 c[2] = texture->bytes[0] * (1.0f/255.0f);
2291 c[3] = texture->bytes[3] * (1.0f/255.0f);
2292 for (x = startx;x < endx;x++)
2294 out4f[x*4+0] = c[0];
2295 out4f[x*4+1] = c[1];
2296 out4f[x*4+2] = c[2];
2297 out4f[x*4+3] = c[3];
// nonzero when the texture asks for linear filtering
2301 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// data/slope are presumably the varying's value at x = 0 and its per-pixel increment — confirm against the macro
2302 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2303 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height in texels
2304 tcscale[0] = texture->mipmap[mip][2];
2305 tcscale[1] = texture->mipmap[mip][3];
// row length used when turning (column, row) into a texel index
2306 tciwidth = texture->mipmap[mip][2];
// largest valid texel index per axis, used as the upper clamp bound
2309 tcimax[0] = texture->mipmap[mip][2]-1;
2310 tcimax[1] = texture->mipmap[mip][3]-1;
// the same values used as AND masks for wrapping (a true modulo only when the size is a power of two)
2311 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2312 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at startx: scaled by the level size, minus half a texel
2313 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2314 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// process the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2315 for (x = startx;x < endx;)
2317 unsigned int subtc[2];
2318 unsigned int substep[2];
// converts a coordinate difference over a full subspan into a 16.16 per-pixel step
2319 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2320 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// shortened final subspan: end on the last pixel and spread the step over the pixels that remain
2323 nextsub = endsub = endx-1;
2324 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
// exact coordinate at the end of this subspan
2328 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2329 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): substep and subtc are unsigned; converting a negative float to
// an unsigned integer is undefined in ISO C, so this relies on the target
// producing two's-complement wraparound — confirm.
2330 substep[0] = (endtc[0] - tc[0]) * subscale;
2331 substep[1] = (endtc[1] - tc[1]) * subscale;
2332 subtc[0] = tc[0] * (1<<16);
2333 subtc[1] = tc[1] * (1<<16);
// bilinear sampling with coordinates clamped to the texture edge
2336 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2338 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit blend fractions and their complements; the four weights below always sum to 0x1000000
// NOTE(review): &0xFFF keeps the lowest 12 bits of the 16-bit fraction and drops
// its top 4 bits — confirm that (subtc>>4)&0xFFF was not intended.
2340 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2341 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2342 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
// integer texel position and its +1 neighbour on each axis
2343 tci[0] = subtc[0]>>16;
2344 tci[1] = subtc[1]>>16;
2345 tci1[0] = tci[0] + 1;
2346 tci1[1] = tci[1] + 1;
// clamp all four indices into [tcimin, tcimax]; the values are unsigned, so a
// coordinate that went negative shows up here as a very large index
2347 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2348 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2349 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2350 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the 2x2 texel footprint (4 bytes per texel)
2351 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2352 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2353 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2354 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weighted sum of the four texels; 0xFF000000 = 255 * 0x1000000 normalizes the result to 0..1
2355 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2356 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2357 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2358 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2359 out4f[x*4+0] = c[0];
2360 out4f[x*4+1] = c[1];
2361 out4f[x*4+2] = c[2];
2362 out4f[x*4+3] = c[3];
// bilinear sampling with wrapped coordinates: same as above, but the indices are ANDed with the wrap mask
2367 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2369 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2370 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2371 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2372 tci[0] = subtc[0]>>16;
2373 tci[1] = subtc[1]>>16;
2374 tci1[0] = tci[0] + 1;
2375 tci1[1] = tci[1] + 1;
2376 tci[0] &= tciwrapmask[0];
2377 tci[1] &= tciwrapmask[1];
2378 tci1[0] &= tciwrapmask[0];
2379 tci1[1] &= tciwrapmask[1];
2380 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2381 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2382 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2383 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2384 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2385 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2386 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2387 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2388 out4f[x*4+0] = c[0];
2389 out4f[x*4+1] = c[1];
2390 out4f[x*4+2] = c[2];
2391 out4f[x*4+3] = c[3];
// single-texel sampling with coordinates clamped to the texture edge
2395 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2397 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2399 tci[0] = subtc[0]>>16;
2400 tci[1] = subtc[1]>>16;
2401 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2402 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2403 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2404 c[0] = pixel[0][2] * (1.0f / 255.0f);
2405 c[1] = pixel[0][1] * (1.0f / 255.0f);
2406 c[2] = pixel[0][0] * (1.0f / 255.0f);
2407 c[3] = pixel[0][3] * (1.0f / 255.0f);
2408 out4f[x*4+0] = c[0];
2409 out4f[x*4+1] = c[1];
2410 out4f[x*4+2] = c[2];
2411 out4f[x*4+3] = c[3];
// single-texel sampling with wrapped coordinates
2416 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2418 tci[0] = subtc[0]>>16;
2419 tci[1] = subtc[1]>>16;
2420 tci[0] &= tciwrapmask[0];
2421 tci[1] &= tciwrapmask[1];
2422 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2423 c[0] = pixel[0][2] * (1.0f / 255.0f);
2424 c[1] = pixel[0][1] * (1.0f / 255.0f);
2425 c[2] = pixel[0][0] * (1.0f / 255.0f);
2426 c[3] = pixel[0][3] * (1.0f / 255.0f);
2427 out4f[x*4+0] = c[0];
2428 out4f[x*4+1] = c[1];
2429 out4f[x*4+2] = c[2];
2430 out4f[x*4+3] = c[3];
// SSE2 version of the 2D texture sampler: samples the texture bound to
// `texunitindex` for pixels startx..endx-1 of the span and writes packed
// 4-byte pixels to out4ub, copied or blended straight from the texel bytes.
// Coordinates come from varying `arrayindex`, are evaluated with zf[] only at
// subspan boundaries, and are stepped in 16.16 fixed point in between; two
// pixels are handled per loop iteration with a one-pixel tail.
2436 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2440 int startx = span->startx;
2441 int endx = span->endx;
2443 __m128 data, slope, tcscale;
2444 __m128i tcsize, tcmask, tcoffset, tcmax;
2446 __m128i subtc, substep, endsubtc;
// 32-bit view of the output so that a whole pixel can be stored at once
2449 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2450 const unsigned char * RESTRICT pixelbase;
2451 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2452 // if no texture is bound, just fill it with white
// NOTE(review): the fill starts at startx but covers span->length pixels;
// if length can exceed endx - startx this writes past the range the loops
// below cover — confirm which count is intended.
2455 memset(out4ub + startx*4, 255, span->length*4);
2458 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of the selected level inside texture->bytes
2459 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2460 // if this mipmap of the texture is 1 pixel, just fill it with that color
2461 if (texture->mipmap[mip][1] == 4)
// the level's only texel, read as one 32-bit value
2463 unsigned int k = *((const unsigned int *)pixelbase);
2464 for (x = startx;x < endx;x++)
// nonzero when the texture asks for linear filtering
2468 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2469 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2470 flags = texture->flags;
// loads mipmap[mip][0..3] and keeps elements [2],[3] twice: lanes = (width, height, width, height)
2471 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
// size - 1 per lane: largest valid index, also used as the wrap mask
2472 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2473 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the low two lanes (the two texture coordinates) into the high half and scale them to texels
2474 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2475 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at startx (minus half a texel) and its 16.16 fixed-point form
2476 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2477 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// every 32-bit lane holds the 16-bit pair (4, width*4); _mm_madd_epi16 of a
// (column, row) pair with it yields the byte offset column*4 + row*width*4
2478 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size - 1 narrowed to 16-bit lanes, for clamping/masking packed (column, row) pairs
2479 tcmax = _mm_packs_epi32(tcmask, tcmask);
// process the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2480 for (x = startx;x < endx;)
2482 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2483 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// shortened final subspan: end on the last pixel and spread the step over the pixels that remain
2486 nextsub = endsub = endx-1;
2487 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// exact coordinate at the end of this subspan, the per-pixel 16.16 step towards it, and its fixed-point form
2491 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2492 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2493 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1; the step then advances two pixels
2494 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2495 substep = _mm_slli_epi32(substep, 1);
// integer coordinates at the start of the subspan and one (doubled) step past its end
2498 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
// fast path: all of them are >= 0 and < size-1, so the 2x2 footprint can be read without clamping or wrapping
2499 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texture row (width*4), taken from the high half of tcoffset's first lane
2501 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2502 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2504 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// pick the integer halves of the four 16.16 coordinates as (column, row) pairs and turn them into byte offsets
2505 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2506 tci = _mm_madd_epi16(tci, tcoffset);
2507 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2508 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; pix1/pix2 are the
// upper/lower row for the first pixel, pix3/pix4 for the second
2509 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2510 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2511 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2512 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions halved so they stay non-negative as signed 16-bit multipliers; the differences are doubled to compensate
2513 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows with the vertical fraction, first pixel then second pixel
2514 pix1 = _mm_add_epi16(pix1,
2515 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2516 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2517 pix3 = _mm_add_epi16(pix3,
2518 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2519 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup into left texels (pix2) and right texels (pix4) of both pixels, then blend with the horizontal fraction
2520 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2521 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2522 pix2 = _mm_add_epi16(pix2,
2523 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2524 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// narrow both results to bytes and store them as two output pixels
2525 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the same fetch and blend (presumably for a pixel left over by the two-at-a-time loop)
2529 const unsigned char * RESTRICT ptr1;
2530 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2531 tci = _mm_madd_epi16(tci, tcoffset);
2532 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2533 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2534 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2535 fracm = _mm_srli_epi16(subtc, 1);
2536 pix1 = _mm_add_epi16(pix1,
2537 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2538 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2539 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2540 pix1 = _mm_add_epi16(pix1,
2541 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2542 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2543 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with clamp-to-edge: each of the four footprint texels is addressed individually
2547 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2549 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// replicate the first pixel's (column, row) into all four lanes, add the offsets
// (0,0) (1,0) (0,1) (1,1), clamp to [0, size-1] and convert to byte offsets
2551 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2552 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2553 tci = _mm_madd_epi16(tci, tcoffset);
// pix1 = upper row (left, right texel), pix2 = lower row (left, right texel)
2554 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2555 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2556 _mm_setzero_si128());
2557 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2558 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2559 _mm_setzero_si128());
// same for the second pixel of the pair
2560 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): this is the clamp-to-edge branch, yet the second pixel's
// indices are ANDed with tcmax (wrapping) while the first pixel above uses
// min/max (clamping) — looks like a copy of the wrap path; confirm.
2561 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2562 tci = _mm_madd_epi16(tci, tcoffset);
2563 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2564 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2565 _mm_setzero_si128());
2566 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2567 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2568 _mm_setzero_si128());
// vertical blend per pixel, then horizontal blend of both pixels, as in the fast path
2569 fracm = _mm_srli_epi16(subtc, 1);
2570 pix1 = _mm_add_epi16(pix1,
2571 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2572 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2573 pix3 = _mm_add_epi16(pix3,
2574 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2575 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2576 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2577 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2578 pix2 = _mm_add_epi16(pix2,
2579 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2580 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2581 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the clamped bilinear fetch
2585 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2586 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2587 tci = _mm_madd_epi16(tci, tcoffset);
2588 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2589 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2590 _mm_setzero_si128());
2591 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2592 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2593 _mm_setzero_si128());
2594 fracm = _mm_srli_epi16(subtc, 1);
2595 pix1 = _mm_add_epi16(pix1,
2596 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2597 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2598 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2599 pix1 = _mm_add_epi16(pix1,
2600 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2601 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2602 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with wrapping: as above, but indices are ANDed with size-1 instead of clamped
2608 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2610 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2611 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2612 tci = _mm_madd_epi16(tci, tcoffset);
2613 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2614 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2615 _mm_setzero_si128());
2616 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2617 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2618 _mm_setzero_si128());
2619 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2620 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2621 tci = _mm_madd_epi16(tci, tcoffset);
2622 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2623 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2624 _mm_setzero_si128());
2625 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2626 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2627 _mm_setzero_si128());
2628 fracm = _mm_srli_epi16(subtc, 1);
2629 pix1 = _mm_add_epi16(pix1,
2630 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2631 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2632 pix3 = _mm_add_epi16(pix3,
2633 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2634 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2635 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2636 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2637 pix2 = _mm_add_epi16(pix2,
2638 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2639 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2640 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the wrapped bilinear fetch
2644 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2645 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2646 tci = _mm_madd_epi16(tci, tcoffset);
2647 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2648 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2649 _mm_setzero_si128());
2650 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2651 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2652 _mm_setzero_si128());
2653 fracm = _mm_srli_epi16(subtc, 1);
2654 pix1 = _mm_add_epi16(pix1,
2655 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2657 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2658 pix1 = _mm_add_epi16(pix1,
2659 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2660 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2661 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling with clamp-to-edge: clamp the (column, row) pairs and copy the texel as one 32-bit value
2668 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2670 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2672 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2673 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2674 tci = _mm_madd_epi16(tci, tcoffset);
2675 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2676 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// one-pixel form
2680 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2681 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2682 tci = _mm_madd_epi16(tci, tcoffset);
2683 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling with wrapping: mask the (column, row) pairs with size-1 and copy the texel
2689 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2691 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2692 tci = _mm_and_si128(tci, tcmax);
2693 tci = _mm_madd_epi16(tci, tcoffset);
2694 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2695 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// one-pixel form
2699 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2700 tci = _mm_and_si128(tci, tcmax);
2701 tci = _mm_madd_epi16(tci, tcoffset);
2702 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2711 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2714 memset(out4ub, 255, span->length*4);
// Returns one float for the float array passed in `vector`.
// NOTE(review): going by the name this is presumably the shadow-map lookup
// for a light-space vector — TODO confirm against the function body.
2717 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies the RGBA floats in in4f by the interpolated four-component
// varying `arrayindex` and stores the products in out4f, for pixels
// startx..endx-1 of the span.
2723 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2726 int startx = span->startx;
2727 int endx = span->endx;
// data/slope are presumably the varying's value at x = 0 and its per-pixel increment — confirm against the macro
2732 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2733 for (x = startx;x < endx;x++)
// varying at pixel x, scaled by z (presumably the per-pixel factor taken from zf — confirm)
2736 c[0] = (data[0] + slope[0]*x) * z;
2737 c[1] = (data[1] + slope[1]*x) * z;
2738 c[2] = (data[2] + slope[2]*x) * z;
2739 c[3] = (data[3] + slope[3]*x) * z;
// channel-wise product of the input pixel and the varying
2740 out4f[x*4+0] = in4f[x*4+0] * c[0];
2741 out4f[x*4+1] = in4f[x*4+1] * c[1];
2742 out4f[x*4+2] = in4f[x*4+2] * c[2];
2743 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated four-component varying `arrayindex` to out4f as
// RGBA floats, for pixels startx..endx-1 of the span.
2747 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2750 int startx = span->startx;
2751 int endx = span->endx;
// data/slope are presumably the varying's value at x = 0 and its per-pixel increment — confirm against the macro
2756 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2757 for (x = startx;x < endx;x++)
// varying at pixel x, scaled by z (presumably the per-pixel factor taken from zf — confirm)
2760 c[0] = (data[0] + slope[0]*x) * z;
2761 c[1] = (data[1] + slope[1]*x) * z;
2762 c[2] = (data[2] + slope[2]*x) * z;
2763 c[3] = (data[3] + slope[3]*x) * z;
2764 out4f[x*4+0] = c[0];
2765 out4f[x*4+1] = c[1];
2766 out4f[x*4+2] = c[2];
2767 out4f[x*4+3] = c[3];
2771 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2773 int x, startx = span->startx, endx = span->endx;
2774 float c[4], localcolor[4];
2775 localcolor[0] = subcolor[0];
2776 localcolor[1] = subcolor[1];
2777 localcolor[2] = subcolor[2];
2778 localcolor[3] = subcolor[3];
2779 for (x = startx;x < endx;x++)
2781 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2782 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2783 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2784 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2785 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2786 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2787 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2788 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2792 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2794 int x, startx = span->startx, endx = span->endx;
2795 for (x = startx;x < endx;x++)
2797 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2798 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2799 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2800 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2804 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2806 int x, startx = span->startx, endx = span->endx;
2807 for (x = startx;x < endx;x++)
2809 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2810 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2811 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2812 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Per-pixel mix of two RGBA float buffers for pixels startx..endx-1 of the
// span: out4f = ina4f * a + inb4f * b, channel by channel, where a is one
// minus the alpha (channel 3) of inb4f at that pixel.
2816 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2818 int x, startx = span->startx, endx = span->endx;
2820 for (x = startx;x < endx;x++)
// weight of ina4f: what is left after inb4f's alpha
2822 a = 1.0f - inb4f[x*4+3];
// b is presumably inb4f's alpha itself, so that a + b == 1 — confirm
2824 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2825 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2826 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2827 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float RGBA span toward one constant colour.
// color points at four floats; color[3] is used as the blend weight (lerp)
// and 1 - color[3] as the weight of the input (ilerp). For x in
// [startx, endx): out = in * ilerp + color * lerp on all four components,
// so the fourth output component also receives color[3] * color[3].
2831 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2833 int x, startx = span->startx, endx = span->endx;
2834 float localcolor[4], ilerp, lerp;
// copy the colour into locals once, before the per-pixel loop
2835 localcolor[0] = color[0];
2836 localcolor[1] = color[1];
2837 localcolor[2] = color[2];
2838 localcolor[3] = color[3];
2839 ilerp = 1.0f - localcolor[3];
2840 lerp = localcolor[3];
2841 for (x = startx;x < endx;x++)
2843 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2844 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2845 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2846 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies an 8-bit-per-channel span (in4ub) by an interpolated vertex
// attribute and writes the result to out4ub.
//   arrayindex - which attribute DPSOFTRAST_CALCATTRIB sets up
//   zf         - per-pixel float factor; the attribute at x is evaluated as
//                (data + slope * x) * zf[x]
// The attribute is evaluated exactly only at sub-span end points (at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped with 16-bit fixed-point
// values (scale 256) in between.
// NOTE(review): several lines of this function are not visible in this excerpt
// (declarations of x, data, slope, mod, endmod; the conditions guarding the
// last-sub-span and trailing-pixel code; the hand-over of endmod/endsubmod to
// mod/submod). DPSOFTRAST_CALCATTRIB is defined elsewhere; presumably it fills
// data with the attribute's base value and slope with its per-x gradient.
2852 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2856 int startx = span->startx;
2857 int endx = span->endx;
2860 __m128i submod, substep, endsubmod;
2861 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps lanes 0 and 2 and keeps lanes 1 and 3,
// matching the BGRA byte order used by the 8-bit buffers
2862 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2863 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 8.8-style fixed point
2864 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2865 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2866 for (x = startx; x < endx;)
2868 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2869 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// shortened final sub-span: end on the last pixel and rescale the step
// (the x < nextsub test avoids dividing by zero for a one-pixel remainder)
2872 nextsub = endsub = endx-1;
2873 if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the end of this sub-span; substep is the fixed-point delta per pixel
2877 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2878 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2879 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit: low half = modulator for pixel x, high half = pixel x+1
2880 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2881 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): this loop consumes two pixels per iteration but adds substep
// (a one-pixel delta) only once -- verify whether the step should be doubled.
2882 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes go into the high byte of each 16-bit lane (value * 256), so the
// high half of the 16x16 product is value * modulator / 256
2884 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2885 pix = _mm_mulhi_epu16(pix, submod);
// packus saturates each lane to 0..255 before the two pixels are stored
2886 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single leftover pixel of the sub-span (its guard is not shown in this excerpt)
2890 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2891 pix = _mm_mulhi_epu16(pix, submod);
2892 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Fills out4ub with an interpolated vertex attribute converted to 8 bits per
// channel (no input span is read).
//   arrayindex - which attribute DPSOFTRAST_CALCATTRIB sets up
//   zf         - per-pixel float factor; the attribute at x is evaluated as
//                (data + slope * x) * zf[x]
// Values are carried in fixed point with scale 4095 and shifted right by 4
// before being saturated to bytes. The attribute is evaluated exactly at
// sub-span end points (at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and
// stepped in between.
// NOTE(review): several lines of this function are not visible in this excerpt
// (declarations of x, data, slope, mod, endmod; the guards of the
// last-sub-span and trailing-pixel code; the hand-over of endmod/endsubmod to
// mod/submod). DPSOFTRAST_CALCATTRIB is defined elsewhere.
2899 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2903 int startx = span->startx;
2904 int endx = span->endx;
2907 __m128i submod, substep, endsubmod;
2908 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 stay) to match the BGRA output order
2909 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel
2911 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2913 for (x = startx; x < endx;)
2915 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// shortened final sub-span: end on the last pixel and rescale the step
// (the x < nextsub test avoids dividing by zero for a one-pixel remainder)
2919 nextsub = endsub = endx-1;
2920 if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the end of this sub-span; substep is the fixed-point delta per pixel
2924 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit: low half = value for pixel x, high half = pixel x+1
2927 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): this loop consumes two pixels per iteration but adds substep
// (a one-pixel delta) only once -- verify whether the step should be doubled.
2929 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to 0..255 and store two pixels
2931 __m128i pix = _mm_srai_epi16(submod, 4);
2932 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single leftover pixel of the sub-span (its guard is not shown in this excerpt)
2936 __m128i pix = _mm_srai_epi16(submod, 4);
2937 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Adds a second 8-bit span to the first after subtracting a constant colour:
// out = saturate_to_byte(ina + (inb - subcolor * 255)) per channel.
//   subcolor - four floats; lanes 0 and 2 are swapped here to match BGRA order
// Pixels are processed two at a time, with one trailing pixel handled
// separately (its guard is not visible in this excerpt).
// The subtraction is a plain 16-bit subtract, so where inb is smaller than the
// scaled subcolor the difference is negative and lowers the sum.
2944 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2947 int x, startx = span->startx, endx = span->endx;
// scale to 0..255 integers and swap lanes 0 and 2
2948 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes and replicate so both 64-bit halves hold the colour
2949 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2950 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input to 16-bit lanes (values 0..255)
2952 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2953 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2954 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2955 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// trailing single pixel
2959 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2960 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2961 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2962 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel product of two 8-bit spans: out = (ina * inb) / 256, truncated.
// ina is widened to 16-bit lanes holding 0..255; inb is placed in the high
// byte of each lane (value * 256), so the high 16 bits of the product equal
// ina * inb >> 8. Consequently 255 * 255 yields 254, not 255.
// Two pixels per iteration, plus one trailing pixel whose guard is not
// visible in this excerpt.
2967 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2970 int x, startx = span->startx, endx = span->endx;
2971 for (x = startx;x+2 <= endx;x+=2)
2973 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2974 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2975 pix1 = _mm_mulhi_epu16(pix1, pix2);
2976 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// trailing single pixel
2980 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2981 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2982 pix1 = _mm_mulhi_epu16(pix1, pix2);
2983 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel saturating sum of two 8-bit spans: out = min(ina + inb, 255).
// Both inputs are widened to 16-bit lanes, added, and narrowed again with
// unsigned saturation. Two pixels per iteration, plus one trailing pixel
// whose guard is not visible in this excerpt.
2988 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2991 int x, startx = span->startx, endx = span->endx;
2992 for (x = startx;x+2 <= endx;x+=2)
2994 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2995 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2996 pix1 = _mm_add_epi16(pix1, pix2);
2997 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// trailing single pixel
3001 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3002 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3003 pix1 = _mm_add_epi16(pix1, pix2);
3004 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of the second 8-bit span to the first:
// out = saturate_to_byte(ina + inb * tint) per channel.
//   inbtintbgra - four floats, scaled by 256 into 16-bit fixed point; no lane
//                 swizzle is applied here (the name suggests the caller
//                 already supplies BGRA order -- confirm at call sites)
// inb is placed in the high byte of each 16-bit lane so the high half of
// tint * inb is inb * tint. Two pixels per iteration, plus one trailing pixel
// whose guard is not visible in this excerpt.
3009 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3012 int x, startx = span->startx, endx = span->endx;
3013 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes and replicate the tint for both pixels
3014 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3015 for (x = startx;x+2 <= endx;x+=2)
3017 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3018 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3019 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3020 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// trailing single pixel
3024 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3025 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3026 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3027 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends the second 8-bit span over the first using the second span's own
// fourth channel as the weight: out = ina + (inb - ina) * inb.a / 256.
// Both the difference and the weight are shifted left by 4 so that the high
// half of their signed 16x16 product is (inb - ina) * a >> 8.
// Two pixels per iteration, plus one trailing pixel whose guard is not
// visible in this excerpt.
3032 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3035 int x, startx = span->startx, endx = span->endx;
3036 for (x = startx;x+2 <= endx;x+=2)
3038 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3039 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth channel) to all four lanes of that pixel
3040 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3041 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3042 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// trailing single pixel: only the low four lanes matter
3046 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3047 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3048 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3049 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3050 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends an 8-bit span toward one constant colour, weighted by that colour's
// fourth component: out = in + (color - in) * color.a / 256, with color
// scaled to 0..255.
//   color - four floats; lanes 0 and 2 are swapped here to match BGRA order
// Two pixels per iteration, plus one trailing pixel whose guard is not
// visible in this excerpt.
3055 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3058 int x, startx = span->startx, endx = span->endx;
3059 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes and replicate the colour for both pixels
3060 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// broadcast the fourth lane (blend weight) and pre-shift it by 4 for the mulhi below
3061 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3062 for (x = startx;x+2 <= endx;x+=2)
3064 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3065 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3066 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// trailing single pixel
3070 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3071 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3072 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the "generic" shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform, then passes the colour
// and first texcoord arrays through. The second texcoord array is loaded only
// when the SPECULAR permutation bit is set.
3079 void DPSOFTRAST_VertexShader_Generic(void)
3081 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3082 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3083 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3084 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3085 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the "generic" shader for one span.
// With the DIFFUSE permutation: samples the first texture, multiplies it by
// the interpolated attribute in array 1, and, if SPECULAR is also set,
// samples the second texture and combines it according to the permutation:
//   COLORMAPPING       -> multiply
//   GLOW               -> add
//   VERTEXTEXTUREBLEND -> blend using the second texture's fourth channel
// Without DIFFUSE the span is filled from the interpolated attribute alone.
// The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The additive branch used to repeat the COLORMAPPING test of the branch
// before it and so could never run; it now tests SHADERPERMUTATION_GLOW.
// NOTE(review): GLOW is chosen to mirror the engine's GLSL generic shader --
// confirm against that shader before merging.
3088 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3090 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3091 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3092 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3093 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3094 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3095 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture (texcoord array 2) modulated by the attribute in array 1
3097 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3098 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3099 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3101 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3102 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3105 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3107 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3110 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3112 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3115 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// no DIFFUSE texture: output the interpolated attribute directly
3120 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3121 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform and passes the first two
// texcoord arrays through.
3126 void DPSOFTRAST_VertexShader_PostProcess(void)
3128 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3130 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the post-process shader for one span.
// Samples the first texture into the output buffer, optionally adds the
// second (bloom) texture minus the BloomColorSubtract uniform, then blends
// the result toward the ViewTintColor uniform and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but not
// implemented.
3133 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3135 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3136 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3137 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3138 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3140 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3141 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// the bloom texture is sampled with texcoord array 3 and combined in place
3143 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3144 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3146 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3147 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3149 // TODO: implement saturation
3151 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3153 // TODO: implement gammaramps
3155 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is
// produced, transformed/projected with the ModelViewProjection matrix uniform.
3160 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3162 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zeroes the first span->length
// pixels (4 bytes each) of the output buffer and finishes the span.
3165 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3167 // this is never called (because colormask is off when this shader is used)
3168 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3169 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3171 memset(buffer_FragColorbgra8, 0, span->length*4);
3172 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-colour shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform and transforms the first
// texcoord array with the TexMatrix uniform.
3177 void DPSOFTRAST_VertexShader_FlatColor(void)
3179 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3180 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader for one span: the colour texture is
// multiplied per channel by the Color_Ambient uniform (fourth channel by the
// Alpha uniform), using integer factors scaled by 256 and a right shift by 8.
// The product is stored straight into an unsigned char, so a result above
// 255 (factor greater than 1.0) is truncated by the store, not clamped.
3183 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3185 int x, startx = span->startx, endx = span->endx;
3186 int Color_Ambienti[4];
3187 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3188 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3189 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// uniform components 0,1,2 go to indices 2,1,0 to match the BGRA byte order
3190 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3191 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3192 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3193 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3194 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3195 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3196 for (x = startx;x < endx;x++)
3198 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3199 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3200 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3201 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3203 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-colour shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform, passes the colour array
// through, and transforms the first texcoord array with the TexMatrix uniform.
3208 void DPSOFTRAST_VertexShader_VertexColor(void)
3210 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3211 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3212 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader for one span:
//   out = texture * (Color_Diffuse * vertexcolor + Color_Ambient)
// with the fourth channel's additive term taken from the Alpha uniform.
// When neither alpha test nor a non-opaque blend mode is active the result is
// written straight into the framebuffer row; otherwise it goes to a temporary
// buffer that DPSOFTRAST_Draw_Span_FinishBGRA8 consumes.
// NOTE(review): several lines of this function are not visible in this excerpt
// (declarations of data, slope, mod2 and pix2; the loop bookkeeping after the
// four-pixel path; the else introducing the single-pixel path).
3215 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3218 unsigned char * RESTRICT pixelmask = span->pixelmask;
3219 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3220 int x, startx = span->startx, endx = span->endx;
3221 __m128i Color_Ambientm, Color_Diffusem;
3223 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3224 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3225 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3227 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3228 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finishing pass, so render to the temporary buffer
3229 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3230 pixel = buffer_FragColorbgra8;
// ambient colour: scale by 256, swap lanes 0 and 2 for BGRA order, replace
// lane 3 with the Alpha uniform (scaled by 255 here, not 256), pack to 16-bit
3231 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3232 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3233 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3234 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour: scale by 4096, same lane swap, lane 3 cleared, pack to 16-bit
3235 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3236 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3237 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex colour attribute: lane-swapped, advanced to startx and scaled by
// 4096 so that mulhi(Color_Diffusem, mod) yields diffuse * colour * 256
3238 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3239 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3240 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3241 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3242 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3243 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3244 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3246 __m128i color, mod, pix;
// four-pixel path: taken when at least four pixels remain and the four
// pixelmask bytes starting at x are all 0x01
3247 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3250 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3251 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// each pixel's attribute is multiplied by its own buffer_z value; data is
// stepped by slope three times here, the loop header supplies the fourth step
3252 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3253 data = _mm_add_ps(data, slope);
3254 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3255 data = _mm_add_ps(data, slope);
3256 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3257 data = _mm_add_ps(data, slope);
3258 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// texture bytes sit in the high byte of each 16-bit lane, so the outer mulhi
// computes texture * (diffuse * colour + ambient) / 256
3259 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3260 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3261 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3262 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3263 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3269 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3270 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3271 mod = _mm_packs_epi32(mod, mod);
3272 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3273 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3275 if(pixel == buffer_FragColorbgra8)
3276 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform, transforms the first texcoord
// array with the TexMatrix uniform, and passes texcoord array 4 through
// (the pixel stage samples the lightmap with it).
3282 void DPSOFTRAST_VertexShader_Lightmap(void)
3284 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3285 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3286 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
//   out = texture * (Color_Diffuse * lightmap + Color_Ambient)
// and, with the GLOW permutation, plus Color_Glow * glowtexture.
// The fourth channel's additive term comes from the Alpha uniform.
// When neither alpha test nor a non-opaque blend mode is active the result is
// written straight into the framebuffer row; otherwise it goes to a temporary
// buffer that DPSOFTRAST_Draw_Span_FinishBGRA8 consumes.
// NOTE(review): several lines of this function are not visible in this excerpt
// (declaration of pix2; the loop bookkeeping after each four-pixel path; the
// else lines separating the glow and non-glow loops).
3289 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3292 unsigned char * RESTRICT pixelmask = span->pixelmask;
3293 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3294 int x, startx = span->startx, endx = span->endx;
3295 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3296 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3297 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3298 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3299 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3300 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3302 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3303 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the finishing pass, so render to the temporary buffer
3304 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3305 pixel = buffer_FragColorbgra8;
// ambient colour: scale by 256, swap lanes 0 and 2 for BGRA order, replace
// lane 3 with the Alpha uniform (scaled by 255 here, not 256), pack to 16-bit
3306 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3307 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3308 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3309 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour: scale by 256, same lane swap, lane 3 cleared, pack to 16-bit
3310 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3311 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3312 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3313 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: also sample the glow texture and prepare its colour factor
3315 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3316 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3317 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3318 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path
3319 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3320 for (x = startx;x < endx;x++)
3322 __m128i color, lightmap, glow, pix;
// four-pixel path: taken when at least four pixels remain and the four
// pixelmask bytes starting at x are all 0x01
3323 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3326 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3327 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3328 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3329 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3330 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3331 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3332 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3333 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3334 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3335 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit colour is computed in the low 64 bits and the
// glow term in the high 64 bits of one register, then the halves are summed
3341 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3342 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3343 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3344 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3345 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3346 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3351 for (x = startx;x < endx;x++)
3353 __m128i color, lightmap, pix;
3354 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3357 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3358 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3359 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3360 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3361 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3362 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3363 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3369 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3370 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3371 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3372 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3375 if(pixel == buffer_FragColorbgra8)
3376 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only the position array is produced,
// transformed/projected with the ModelViewProjection matrix uniform.
3382 void DPSOFTRAST_VertexShader_FakeLight(void)
3384 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: as written here it performs no
// lighting; it zeroes the first span->length pixels (4 bytes each) of the
// output buffer and finishes the span.
3387 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3390 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3391 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3392 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393 memset(buffer_FragColorbgra8, 0, span->length*4);
3394 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light-direction maps: delegates entirely to
// the plain lightmap vertex shader.
3399 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3401 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for model-space light-direction maps: delegates entirely to the
// plain lightmap pixel shader, passing its arguments through unchanged.
3404 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3406 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light-direction maps: delegates entirely to
// the plain lightmap vertex shader.
3412 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3414 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for tangent-space light-direction maps: delegates entirely to
// the plain lightmap pixel shader, passing its arguments through unchanged.
3417 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3419 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the directional-light shader.
// For every vertex it expresses the LightDir uniform and the vertex-to-eye
// vector (EyePosition uniform minus vertex position) in the basis formed by
// texcoord arrays 1, 2 and 3 (read as svector, tvector and normal), and
// writes them back over texcoord arrays 1 and 2 with the fourth component 0.
// The position array is loaded untransformed first, because the eye vector is
// computed from it, and is transformed/projected only at the end.
// NOTE(review): the declarations of i, LightDir, EyeVector, position,
// svector, tvector and normal are not visible in this excerpt.
3425 void DPSOFTRAST_VertexShader_LightDirection(void)
3428 int numvertices = dpsoftrast.numvertices;
3430 float LightVector[4];
3431 float EyePosition[4];
3432 float EyeVectorModelSpace[4];
// copy the two uniforms into locals before the per-vertex loop
3438 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3439 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3440 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3441 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3442 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3443 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3444 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3445 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3446 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3447 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3448 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3449 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3450 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3451 for (i = 0;i < numvertices;i++)
// read this vertex's position and basis vectors before they are overwritten
3453 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3454 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3455 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3456 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3457 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3458 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3459 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3460 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3461 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3462 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3463 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3464 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light direction projected onto svector / tvector / normal (three dot products)
3465 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3466 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3467 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3468 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3469 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3470 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3471 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vertex-to-eye vector, then projected onto the same three basis vectors
3472 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3473 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3474 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3475 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3476 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3477 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3478 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3479 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3480 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3481 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): the second argument is -1 here, unlike the other vertex
// shaders; presumably it tells the helper that the position data is already
// in the post-transform array -- confirm in DPSOFTRAST_Array_TransformProject.
3483 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helper macros.
// Min/Max expand each argument more than once, so arguments must not have side effects.
3486 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3487 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product over elements [0..2] of two indexable operands
3488 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length and length, both built on the dot product of v with itself
3489 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3490 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing the length of v with sqrt.
// NOTE(review): only the first lines of the body are shown here; presumably it
// scales v to unit length — confirm against the full definition.
3491 #define DPSOFTRAST_Vector3Normalize(v)\
3494 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3505 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Pixel stage of the light-direction shader: shades the pixels startx..endx-1
// of one span into buffer_FragColorbgra8 and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f and shader_permutation
//   triangle - triangle being drawn (forwarded to the span helpers)
//   span     - span being drawn; supplies startx/endx
// thread->shader_permutation selects one of three lighting paths:
//   SPECULAR: ambient + diffuse + specular
//   DIFFUSE : ambient + diffuse
//   neither : ambient only
// The GLOW bit adds the glow texture term in each of those paths.
3507 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3508 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3509 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3510 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3511 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3514 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3515 int x, startx = span->startx, endx = span->endx;
3516 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3517 float LightVectordata[4];
3518 float LightVectorslope[4];
3519 float EyeVectordata[4];
3520 float EyeVectorslope[4];
3522 float diffusetex[4];
3524 float surfacenormal[4];
3525 float lightnormal[4];
3527 float specularnormal[4];
3530 float SpecularPower;
// Uniform colors are copied with components 0 and 2 swapped so that each
// index lines up with the *bgra8 texel byte it is multiplied with below.
3532 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3533 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3534 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3535 Color_Glow[3] = 0.0f;
3536 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3537 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3538 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the alpha multiplier comes from the separate Alpha uniform
3539 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3540 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3541 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3542 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3543 Color_Pants[3] = 0.0f;
3544 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3545 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3546 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3547 Color_Shirt[3] = 0.0f;
// buffer_z is passed to every texture fetch below; presumably filled by Span_Begin — confirm
3548 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// the color texture is always fetched; pants/shirt and glow only when their permutation bit is set
3549 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3550 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3552 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3553 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3555 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3557 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3559 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3561 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3562 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3563 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3564 Color_Diffuse[3] = 0.0f;
3565 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3566 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3567 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3568 LightColor[3] = 0.0f;
// base value and per-x slope of the TEXCOORD1 attribute for this span
3569 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3570 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3571 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3572 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3573 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3574 Color_Specular[3] = 0.0f;
// scaled by 1/255; it is later multiplied by the raw gloss alpha byte (glosstex[3])
3575 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// base value and per-x slope of the TEXCOORD2 attribute for this span
3576 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3577 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3578 for (x = startx;x < endx;x++)
// NOTE(review): z is used below but its assignment is not shown here;
// presumably the per-pixel value from buffer_z — confirm
3581 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3582 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3583 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3584 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add the pants and shirt texels, each tinted by its own color
// (alpha is unaffected because Color_Pants[3] and Color_Shirt[3] are 0)
3585 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3587 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3588 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3589 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3590 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3592 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3593 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3594 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3595 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte * 1/128 - 1, reading the three color bytes in reverse order
3596 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3597 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3598 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3599 DPSOFTRAST_Vector3Normalize(surfacenormal);
// light vector at this x: data + slope*x, multiplied by z
3601 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3602 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3603 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3604 DPSOFTRAST_Vector3Normalize(lightnormal);
// eye vector at this x, computed the same way
3606 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3607 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3608 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3609 DPSOFTRAST_Vector3Normalize(eyenormal);
// sum of the light and eye vectors, passed through the normalize macro again
3611 specularnormal[0] = lightnormal[0] + eyenormal[0];
3612 specularnormal[1] = lightnormal[1] + eyenormal[1];
3613 specularnormal[2] = lightnormal[2] + eyenormal[2];
3614 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped to be non-negative
3616 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3617 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3618 specular = pow(specular, SpecularPower * glosstex[3]);
// each channel is clamped to 255 from above only; alpha = diffuse texel alpha * Color_Ambient[3]
3619 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3621 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3622 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3623 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3624 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// variant without the glow term
3628 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3629 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3630 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3631 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the four channel values as bytes in the output span buffer
3633 buffer_FragColorbgra8[x*4+0] = d[0];
3634 buffer_FragColorbgra8[x*4+1] = d[1];
3635 buffer_FragColorbgra8[x*4+2] = d[2];
3636 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (no gloss texture, no eye vector) ----
3639 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3641 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3642 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3643 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3644 Color_Diffuse[3] = 0.0f;
3645 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3646 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3647 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3648 LightColor[3] = 0.0f;
3649 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3650 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651 for (x = startx;x < endx;x++)
// unlike the specular path, no pants/shirt contribution is added to diffusetex here
3654 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3655 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3656 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3657 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// same normal map decode and light vector evaluation as in the specular path
3658 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3659 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3660 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3661 DPSOFTRAST_Vector3Normalize(surfacenormal);
3663 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3664 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3665 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3666 DPSOFTRAST_Vector3Normalize(lightnormal);
3668 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3669 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3671 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3672 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3673 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3674 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// variant without the glow term
3678 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3679 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3680 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3681 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3683 buffer_FragColorbgra8[x*4+0] = d[0];
3684 buffer_FragColorbgra8[x*4+1] = d[1];
3685 buffer_FragColorbgra8[x*4+2] = d[2];
3686 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (color texel * Color_Ambient, plus optional glow) ----
3691 for (x = startx;x < endx;x++)
3694 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3695 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3696 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3697 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3699 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3701 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3702 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3703 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3704 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// variant without the glow term
3708 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3709 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3710 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3711 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3713 buffer_FragColorbgra8[x*4+0] = d[0];
3714 buffer_FragColorbgra8[x*4+1] = d[1];
3715 buffer_FragColorbgra8[x*4+2] = d[2];
3716 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span over for output
3719 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the light-source shader.
// Reads the LightPosition and EyePosition uniforms and, for every vertex, replaces
//  - TEXCOORD1 with (LightPosition - position) expressed in the per-vertex
//    (svector, tvector, normal) basis
//  - TEXCOORD2 with (EyePosition - position) expressed in that same basis
// then passes POSITION to DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix and calls DPSOFTRAST_Array_Transform for
// TEXCOORD3 with the ModelToLight matrix. Takes no parameters and returns
// nothing; all inputs and outputs live in the global dpsoftrast state.
3724 void DPSOFTRAST_VertexShader_LightSource(void)
3727 int numvertices = dpsoftrast.numvertices;
3728 float LightPosition[4];
3729 float LightVector[4];
3730 float LightVectorModelSpace[4];
3731 float EyePosition[4];
3732 float EyeVectorModelSpace[4];
// copy the two 4-float uniforms into locals
3738 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3739 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3740 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3741 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3742 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3743 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3744 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3745 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// presumably these fill dpsoftrast.post_array4f[...] (read in the loop below)
// from the input arrays; TEXCOORD0 additionally goes through TexMatrix — confirm
// against DPSOFTRAST_Array_Load / DPSOFTRAST_Array_Transform
3746 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3747 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3748 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3749 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3750 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
// TEXCOORD4 is loaded as well, but this function does not read it afterwards
3751 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3752 for (i = 0;i < numvertices;i++)
// fetch position and the three basis vectors (TEXCOORD1/2/3) before
// TEXCOORD1 and TEXCOORD2 are overwritten further down
3754 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3755 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3756 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3757 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3758 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3759 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3760 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3761 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3762 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3763 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3764 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3765 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light, then projected onto the basis
3766 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3767 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3768 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3769 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3770 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3771 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector (w forced to 0)
3772 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3773 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3774 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3775 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, then projected onto the same basis
3776 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3777 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3778 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3779 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3780 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3781 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (w forced to 0)
3782 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3783 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3784 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3785 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): the meaning of the -1 second argument is not visible here — confirm
3787 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// presumably (destination, source, matrix): TEXCOORD3 receives the input positions
// transformed by ModelToLight, replacing the normals loaded above — confirm
3788 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3791 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3794 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3795 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3796 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3797 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3798 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3799 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3800 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3801 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3802 int x, startx = span->startx, endx = span->endx;
3803 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3804 float CubeVectordata[4];
3805 float CubeVectorslope[4];
3806 float LightVectordata[4];
3807 float LightVectorslope[4];
3808 float EyeVectordata[4];
3809 float EyeVectorslope[4];
3811 float diffusetex[4];
3813 float surfacenormal[4];
3814 float lightnormal[4];
3816 float specularnormal[4];
3819 float SpecularPower;
3820 float CubeVector[4];
3823 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3824 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3825 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3826 Color_Glow[3] = 0.0f;
3827 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3828 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3829 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3830 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3831 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3832 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3833 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3834 Color_Diffuse[3] = 0.0f;
3835 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3836 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3837 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3838 Color_Specular[3] = 0.0f;
3839 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3840 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3841 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3842 Color_Pants[3] = 0.0f;
3843 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3844 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3845 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3846 Color_Shirt[3] = 0.0f;
3847 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3848 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3849 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3850 LightColor[3] = 0.0f;
3851 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3852 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3853 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3854 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3855 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3856 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3857 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3858 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3860 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3861 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3863 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3864 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3865 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3867 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3868 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3869 for (x = startx;x < endx;x++)
3872 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3873 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3874 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3875 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3876 if (attenuation < 0.01f)
3878 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3880 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3881 if (attenuation < 0.01f)
3885 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3886 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3887 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3888 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3889 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3891 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3892 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3893 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3894 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3896 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3897 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3898 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3899 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3900 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3901 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3902 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3903 DPSOFTRAST_Vector3Normalize(surfacenormal);
3905 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3906 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3907 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3908 DPSOFTRAST_Vector3Normalize(lightnormal);
3910 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3911 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3912 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3913 DPSOFTRAST_Vector3Normalize(eyenormal);
3915 specularnormal[0] = lightnormal[0] + eyenormal[0];
3916 specularnormal[1] = lightnormal[1] + eyenormal[1];
3917 specularnormal[2] = lightnormal[2] + eyenormal[2];
3918 DPSOFTRAST_Vector3Normalize(specularnormal);
3920 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3921 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3922 specular = pow(specular, SpecularPower * glosstex[3]);
3923 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3925 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3926 attenuation *= (1.0f / 255.0f);
3927 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3928 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3929 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3930 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3934 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3935 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3936 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3937 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3939 buffer_FragColorbgra8[x*4+0] = d[0];
3940 buffer_FragColorbgra8[x*4+1] = d[1];
3941 buffer_FragColorbgra8[x*4+2] = d[2];
3942 buffer_FragColorbgra8[x*4+3] = d[3];
3945 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3947 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3948 for (x = startx;x < endx;x++)
3951 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3952 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3953 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3954 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3955 if (attenuation < 0.01f)
3957 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3959 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3960 if (attenuation < 0.01f)
3964 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3965 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3966 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3967 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3968 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3970 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3971 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3972 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3973 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3975 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3976 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3977 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3978 DPSOFTRAST_Vector3Normalize(surfacenormal);
3980 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3981 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3982 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3983 DPSOFTRAST_Vector3Normalize(lightnormal);
3985 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3986 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3988 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3989 attenuation *= (1.0f / 255.0f);
3990 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3991 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3992 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3993 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3997 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3998 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3999 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4000 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4002 buffer_FragColorbgra8[x*4+0] = d[0];
4003 buffer_FragColorbgra8[x*4+1] = d[1];
4004 buffer_FragColorbgra8[x*4+2] = d[2];
4005 buffer_FragColorbgra8[x*4+3] = d[3];
4010 for (x = startx;x < endx;x++)
4013 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4014 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4015 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4016 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4017 if (attenuation < 0.01f)
4019 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4021 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4022 if (attenuation < 0.01f)
4026 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4027 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4028 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4029 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4030 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4032 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4033 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4034 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4035 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4037 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4039 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4040 attenuation *= (1.0f / 255.0f);
4041 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4042 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4043 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4044 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4048 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4049 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4050 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4051 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4053 buffer_FragColorbgra8[x*4+0] = d[0];
4054 buffer_FragColorbgra8[x*4+1] = d[1];
4055 buffer_FragColorbgra8[x*4+2] = d[2];
4056 buffer_FragColorbgra8[x*4+3] = d[3];
4059 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the refraction shader mode.
// Only the position array is processed: it is passed to DPSOFTRAST_Array_TransformProject as both
// input and output, together with the uniform slot DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1
// (uniform4f is indexed in groups of 4 floats, hence the 4* offset).
4065 void DPSOFTRAST_VertexShader_Refraction(void)
4067 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the refraction shader mode.
// No refraction shading is computed here: after DPSOFTRAST_Draw_Span_Begin has been given buffer_z,
// the colour buffer is zero-filled for span->length pixels (4 bytes each) and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4070 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4073 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4074 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4075 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// all-zero BGRA output for every pixel of the span
4076 memset(buffer_FragColorbgra8, 0, span->length*4);
4077 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the water shader mode.
// Only the position array is processed: it is passed to DPSOFTRAST_Array_TransformProject as both
// input and output, together with the uniform slot DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1
// (uniform4f is indexed in groups of 4 floats, hence the 4* offset).
4082 void DPSOFTRAST_VertexShader_Water(void)
4084 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the water shader mode.
// No water shading is computed here: after DPSOFTRAST_Draw_Span_Begin has been given buffer_z,
// the colour buffer is zero-filled for span->length pixels (4 bytes each) and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4088 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4091 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4092 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4093 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// all-zero BGRA output for every pixel of the span
4094 memset(buffer_FragColorbgra8, 0, span->length*4);
4095 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the show-depth shader mode.
// Only the position array is processed: it is passed to DPSOFTRAST_Array_TransformProject as both
// input and output, together with the uniform slot DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1
// (uniform4f is indexed in groups of 4 floats, hence the 4* offset).
4100 void DPSOFTRAST_VertexShader_ShowDepth(void)
4102 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the show-depth shader mode.
// No depth visualisation is computed here: after DPSOFTRAST_Draw_Span_Begin has been given buffer_z,
// the colour buffer is zero-filled for span->length pixels (4 bytes each) and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4105 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4108 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4109 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4110 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// all-zero BGRA output for every pixel of the span
4111 memset(buffer_FragColorbgra8, 0, span->length*4);
4112 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-geometry shader mode.
// Only the position array is processed: it is passed to DPSOFTRAST_Array_TransformProject as both
// input and output, together with the uniform slot DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1
// (uniform4f is indexed in groups of 4 floats, hence the 4* offset).
4117 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4119 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the deferred-geometry shader mode.
// No geometry-buffer output is computed here: after DPSOFTRAST_Draw_Span_Begin has been given
// buffer_z, the colour buffer is zero-filled for span->length pixels (4 bytes each) and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4122 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4125 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4126 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// all-zero BGRA output for every pixel of the span
4128 memset(buffer_FragColorbgra8, 0, span->length*4);
4129 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-light-source shader mode.
// Only the position array is processed: it is passed to DPSOFTRAST_Array_TransformProject as both
// input and output, together with the uniform slot DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1
// (uniform4f is indexed in groups of 4 floats, hence the 4* offset).
4134 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4136 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the deferred-light-source shader mode.
// No lighting is computed here: after DPSOFTRAST_Draw_Span_Begin has been given buffer_z,
// the colour buffer is zero-filled for span->length pixels (4 bytes each) and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4139 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4142 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4143 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// all-zero BGRA output for every pixel of the span
4145 memset(buffer_FragColorbgra8, 0, span->length*4);
4146 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; one entry per mode lives in DPSOFTRAST_ShaderModeTable.
4151 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage, called without arguments by DPSOFTRAST_DrawTriangles
4154 void (*Vertex)(void);
// span (pixel) stage, called once per span by DPSOFTRAST_Draw_ProcessSpans
4155 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices the mode interpolates; readers test each value against DPSOFTRAST_ARRAY_TOTAL
4156 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture units the mode samples; readers test each value against DPSOFTRAST_MAXTEXTUREUNITS
4157 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4159 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (thread->shader_mode / dpsoftrast.shader_mode).
// Each row is: leading integer, vertex stage, span stage, arrays list, texunits list.
// The leading 2 is presumably the lodarrayindex member read by DPSOFTRAST_Interpret_Draw
// (2 == DPSOFTRAST_ARRAY_TEXCOORD0 in the DPSOFTRAST_ARRAY enum) - TODO confirm.
// ~0 stored in an unsigned char marks the end of a list: the loops that read arrays/texunits
// compare each value against DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS.
// NOTE(review): the last five rows (Refraction .. DeferredLightSource) give no texunits initializer,
// so C zero-fills that member and the list is all texture unit 0 instead of being ~0-terminated,
// unlike the Depth_Or_Shadow and FakeLight rows which spell out {~0}. Confirm whether that is intended.
4161 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4163 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4164 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4165 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4166 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4167 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4168 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4169 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4170 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4171 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4172 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4173 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4174 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4175 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4176 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4177 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4178 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Depth-tests and shades every span currently queued on this thread (thread->spans[0..numspans-1]),
// then empties the queue by resetting thread->numspans to 0.
// For each span the owning triangle is looked up through span->triangle. With depth testing on and a
// depth buffer present, a per-pixel pass/fail mask is built from the triangle's interpolated w and
// thread->fb_depthfunc; otherwise every pixel of the span is marked as passing.
// The span stage of the current shader mode is only called when a colour buffer exists and
// thread->fb_colormask is non-zero.
// The mask is a stack array of this function, so span->pixelmask is only valid during the call.
4181 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4188 // unsigned int *colorpixel;
4189 unsigned int *depthpixel;
4195 DPSOFTRAST_State_Triangle *triangle;
4196 DPSOFTRAST_State_Span *span;
4197 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4198 for (i = 0; i < thread->numspans; i++)
4200 span = &thread->spans[i];
4201 triangle = &thread->triangles[span->triangle];
4202 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds the plane for w: [0] = change per x, [1] = change per y, [2] = value at the origin
4204 wslope = triangle->w[0];
4205 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// fixed-point depth step per pixel and starting depth; the starting depth is biased by the polygon offset
// (polygonoffset[1] as constant term, polygonoffset[0] scaled by the absolute slope)
4206 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4207 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// first depth-buffer pixel of this span (row-major, fb_width pixels per row)
4208 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// pixelmask[x] becomes non-zero where the stored depth compares as requested against the new depth d
4209 switch(thread->fb_depthfunc)
4212 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4213 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4214 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4215 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4216 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4217 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4218 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4220 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4221 //for (x = 0;x < span->length;x++)
4222 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4223 // if there is no color buffer, skip pixel shader
// shrink [startx, endx) past leading and trailing pixels that failed the depth test
4225 endx = span->length;
4226 while (startx < endx && !pixelmask[startx])
4228 while (endx > startx && !pixelmask[endx-1])
4231 continue; // no pixels to fill
4232 span->pixelmask = pixelmask;
4233 span->startx = startx;
4235 // run pixel shader if appropriate
4236 // do this before running depthmask code, to allow the pixelshader
4237 // to clear pixelmask values for alpha testing
4238 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4239 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled, walk the trimmed range again with the interpolated depth
4240 if (thread->depthmask)
4241 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4247 // no depth testing means we're just dealing with color...
4248 // if there is no color buffer, skip pixel shader
4249 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes
4251 memset(pixelmask, 1, span->length);
4252 span->pixelmask = pixelmask;
4254 span->endx = span->length;
4255 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// queue consumed
4259 thread->numspans = 0;
// Declares the Draw command record through DEFCOMMAND (defined elsewhere in this file); presumably this
// yields DPSOFTRAST_Command_Draw / DPSOFTRAST_OPCODE_Draw with opcode value 22 - TODO confirm.
// Members listed here: the row range [starty, endy) the draw touches, a refcount that each raster
// thread decrements when it is done with the command, the clipped flag, the vertex range, the triangle
// count, the vertex data block (arrays) and the element list in either 32-bit (element3i) or 16-bit
// (element3s) form.
4262 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one queued Draw command on behalf of a single raster thread.
// The thread owns framebuffer rows [miny, maxy), derived from thread->index, dpsoftrast.fb_height and
// dpsoftrast.numthreads. If the command's [starty, endy) range misses that band, only the command's
// refcount is dropped. Otherwise every triangle is clipped against the near plane (clipdist = z + w of
// the post-transform position), projected to screen space, bounded against the thread's band, given
// interpolation planes for w and for each attribute array of the current shader mode, assigned a mip
// level per texture unit, and scan-converted into spans. Spans and triangles are queued on the thread
// and flushed through DPSOFTRAST_Draw_ProcessSpans whenever either queue fills up, and once more at the end.
// When the refcount reaches zero and commandsize is no larger than the aligned size of
// DPSOFTRAST_Command_Draw (apparently the case where DPSOFTRAST_Draw_AllocateDrawCommand allocated the
// vertex data separately), command->arrays is released with MM_FREE.
4264 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4267 int cullface = thread->cullface;
4268 int width = dpsoftrast.fb_width;
// row band of the framebuffer owned by this thread
4269 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4270 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4271 __m128i fbmin, fbmax;
4272 __m128 viewportcenter, viewportscale;
4273 int firstvertex = command->firstvertex;
4274 int numvertices = command->numvertices;
4275 int numtriangles = command->numtriangles;
4276 const int *element3i = command->element3i;
4277 const unsigned short *element3s = command->element3s;
4278 int clipped = command->clipped;
4289 __m128 triangleedge1, triangleedge2, trianglenormal;
4292 DPSOFTRAST_State_Triangle *triangle;
4293 DPSOFTRAST_Texture *texture;
// nothing of this draw falls into this thread's band: just release our reference
4294 if (command->starty >= maxy || command->endy <= miny)
4296 if (!ATOMIC_DECREMENT(command->refcount))
4298 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4299 MM_FREE(command->arrays);
4303 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// inclusive 16-bit (x, y) bounds of the band, replicated for four points: (0, miny) .. (width-1, maxy-1)
4304 fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4305 fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4306 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4307 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4308 screen[3] = _mm_setzero_ps();
4309 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4310 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen coordinates, followed by the post-transform arrays
4312 const float *screencoord4f = command->arrays;
4313 const float *arrays = screencoord4f + numvertices*4;
4315 // generate the 3 edges of this triangle
4316 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices of this triangle, made relative to firstvertex (16-bit list first, 32-bit list below)
4319 e[0] = element3s[i*3+0] - firstvertex;
4320 e[1] = element3s[i*3+1] - firstvertex;
4321 e[2] = element3s[i*3+2] - firstvertex;
4325 e[0] = element3i[i*3+0] - firstvertex;
4326 e[1] = element3i[i*3+1] - firstvertex;
4327 e[2] = element3i[i*3+2] - firstvertex;
// helper macros used by the clipping code of this loop:
// SKIPBACKFACE computes edge1.x*edge2.y - edge1.y*edge2.x of the two screen-space edges into element 0
// of trianglenormal and tests its sign with _mm_ucomilt_ss / _mm_ucomigt_ss;
// CLIPPEDVERTEXLERP(k,p1,p2) stores clipdist[p1] / (clipdist[p1] - clipdist[p2]) in clipfrac[p1] and
// projects the position interpolated between vertices p1 and p2 into screen[k];
// CLIPPEDVERTEXCOPY(k,p1) loads the already projected coordinate of vertex p1 into screen[k];
// GENATTRIBCOPY / GENATTRIBLERP do the same for the current attribute array, reusing clipfrac[p1];
// GENATTRIBS picks copy or lerp for each of the three attribute values according to its case labels 0-6.
4336 #define SKIPBACKFACE \
4337 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4338 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4339 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4340 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4341 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4345 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4349 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4354 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4355 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4357 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4358 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4360 #define CLIPPEDVERTEXCOPY(k,p1) \
4361 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4363 #define GENATTRIBCOPY(attrib, p1) \
4364 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4365 #define GENATTRIBLERP(attrib, p1, p2) \
4367 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4368 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4370 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4374 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4375 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4376 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4377 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4378 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4379 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4380 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4386 // calculate distance from nearplane
4387 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4388 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4389 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// a vertex with clipdist >= 0 is kept as is; each branch below handles one inside/outside combination
4390 if (clipdist[0] >= 0.0f)
4392 if (clipdist[1] >= 0.0f)
4394 if (clipdist[2] >= 0.0f)
4397 // triangle is entirely in front of nearplane
4398 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4405 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4413 if (clipdist[2] >= 0.0f)
4415 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4422 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4429 else if (clipdist[1] >= 0.0f)
4431 if (clipdist[2] >= 0.0f)
4433 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4440 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4446 else if (clipdist[2] >= 0.0f)
4448 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4453 else continue; // triangle is entirely behind nearplane
4456 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as signed 16-bit values (point 2 is repeated
// when there is no fourth point); the min/max steps reduce them to one bounding box
4457 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4458 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4459 screenmin = _mm_min_epi16(screeni, screenir),
4460 screenmax = _mm_max_epi16(screeni, screenir);
4461 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4462 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to this thread's band
4463 screenmin = _mm_max_epi16(screenmin, fbmin);
4464 screenmax = _mm_min_epi16(screenmax, fbmax);
4465 // skip offscreen triangles
4466 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range to scan (endy exclusive) and the per-point y values widened to 32 bits
4468 starty = _mm_extract_epi16(screenmin, 1);
4469 endy = _mm_extract_epi16(screenmax, 1)+1;
4470 screeny = _mm_srai_epi32(screeni, 16);
4473 triangle = &thread->triangles[thread->numtriangles];
4475 // calculate attribute plans for triangle data...
4476 // okay, this triangle is going to produce spans, we'd better project
4477 // the interpolants now (this is what gives perspective texturing),
4478 // this consists of simply multiplying all arrays by the W coord
4479 // (which is basically 1/Z), which will be undone per-pixel
4480 // (multiplying by Z again) to get the perspective-correct array
4483 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4484 __m128 mipedgescale, mipdensity;
// edge components divided by the scalar in element 0 of trianglenormal, then broadcast individually
4485 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4486 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4487 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4488 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4489 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of the three screen points, broadcast to all lanes
4490 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4491 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4492 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane for w itself: slope per x, slope per y, and value at the screen origin (point 1 is the reference)
4493 attribedge1 = _mm_sub_ss(w0, w1);
4494 attribedge2 = _mm_sub_ss(w2, w1);
4495 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4496 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4497 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4498 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4499 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4500 _mm_store_ss(&triangle->w[0], attribxslope);
4501 _mm_store_ss(&triangle->w[1], attribyslope);
4502 _mm_store_ss(&triangle->w[2], attriborigin);
4503 mipedgescale = _mm_setzero_ps();
// same plane setup for every attribute array the shader mode lists, with each value pre-multiplied by w
4504 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4506 __m128 attrib0, attrib1, attrib2;
4507 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4508 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// the arrays are stored back to back, numvertices float[4] each, in table order
4510 arrays += numvertices*4;
4511 GENATTRIBS(attrib0, attrib1, attrib2);
4512 attriborigin = _mm_mul_ps(attrib1, w1);
4513 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4514 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4515 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4516 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4517 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4518 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4519 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4520 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex also yields the attribute change per unit of screen edge length,
// which the mip selection below consumes
4521 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4523 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4524 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4525 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4526 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for every texture unit of the shader mode; the default is level 0
4530 memset(triangle->mip, 0, sizeof(triangle->mip));
4531 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4533 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4534 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4536 texture = thread->texbound[texunit];
4537 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2..3] (presumably the level-0 width and height -
// TODO confirm), square and sum per edge, and keep the smaller of the two edge results
4539 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4540 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4541 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4542 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4543 // this will be multiplied in the texturing routine by the texture resolution
4544 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), because y is a squared quantity; clamped to the last available mip level
4547 y = (int)(log((float)y)*0.5f/M_LN2);
4548 if (y > texture->mipmaps - 1)
4549 y = texture->mipmaps - 1;
4550 triangle->mip[texunit] = y;
// scan conversion: advance y through the band, choosing the left/right edge pair for each stretch of rows
4556 for (y = starty; y < endy;)
4558 __m128 xcoords, xslope;
// one 32-bit lane per point, all ones where y is greater than that point's y; movemask turns each
// lane into one hex digit (F or 0) of yccmask
4559 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4560 int yccmask = _mm_movemask_epi8(ycc);
4561 int edge0p, edge0n, edge1p, edge1n;
// edge table for polygons with four points (edge indices go up to 3)
4568 case 0xFFFF: /*0000*/ y = endy; continue;
4569 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4570 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4571 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4572 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4573 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4574 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4575 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4576 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4577 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4578 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4579 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4580 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4581 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4582 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4583 case 0x0000: /*1111*/ y++; continue;
// edge table for plain triangles (edge indices 0-2 only)
4591 case 0xFFFF: /*000*/ y = endy; continue;
4592 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4593 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4594 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4595 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4596 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4597 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4598 case 0x0000: /*111*/ y++; continue;
// nexty: last row of the current stretch, taken from the point y values and capped to the band
4601 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4602 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4603 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4604 nexty = _mm_extract_epi16(ycc, 0);
4605 if(nexty >= endy) nexty = endy-1;
4606 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// dx/dy of both edges at once, then their x at row y plus 0.5
4615 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4616 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4617 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4618 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4619 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4620 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4622 int startx, endx, offset;
// horizontal extent of this row, clamped to the framebuffer width
4623 startx = _mm_cvtss_si32(xcoords);
4624 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4625 if (startx < 0) startx = 0;
4626 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4627 if (startx >= endx) continue;
// split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4628 for (offset = startx; offset < endx;)
4630 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4631 span->triangle = thread->numtriangles;
4634 span->length = endx - offset;
4635 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4636 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4637 offset += span->length;
// span queue full: shade what is queued so far
4638 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4639 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: shade the queued spans and start over with an empty triangle queue
4644 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4646 DPSOFTRAST_Draw_ProcessSpans(thread);
4647 thread->numtriangles = 0;
// this thread is done with the command; the thread that drops the last reference frees the vertex data
4651 if (!ATOMIC_DECREMENT(command->refcount))
4653 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4654 MM_FREE(command->arrays);
// flush whatever is still queued
4657 if (thread->numspans > 0 || thread->numtriangles > 0)
4659 DPSOFTRAST_Draw_ProcessSpans(thread);
4660 thread->numtriangles = 0;
// Reserves a Draw command plus the storage its vertex data needs, and points the global vertex
// pipeline state at that storage.
// The data block holds, in this order: numvertices screen coordinates, numvertices post-transform
// positions, one float[4] array of numvertices entries per array listed for the current shader mode,
// and finally a copy of the element list (16-bit or 32-bit).
// If command + data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, the command is allocated with only its
// own size and the data comes from MM_CALLOC; otherwise the data is placed directly behind the command
// in the same allocation.
// firstvertex / numvertices / numtriangles are stored in the command; element3i / element3s are the
// caller's index lists, copied into the data block.
// The command pointer is what the caller (DPSOFTRAST_DrawTriangles) receives.
// NOTE(review): no NULL check on the MM_CALLOC result is apparent before the block is written to - confirm.
4665 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4669 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates + transformed positions
4670 int datasize = 2*numvertices*sizeof(float[4]);
4671 DPSOFTRAST_Command_Draw *command;
4672 unsigned char *data;
// one more array per entry of the shader mode's arrays list
4673 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4675 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4676 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4678 datasize += numvertices*sizeof(float[4]);
// room for the element list, 16-bit or 32-bit
4681 datasize += numtriangles*sizeof(unsigned short[3]);
4683 datasize += numtriangles*sizeof(int[3]);
4684 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to live inside the command: allocate the data separately (zero-filled)
4685 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4687 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4688 data = (unsigned char *)MM_CALLOC(datasize, 1);
// small enough: the data follows the command in the same allocation
4692 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4693 data = (unsigned char *)command + commandsize;
4695 command->firstvertex = firstvertex;
4696 command->numvertices = numvertices;
4697 command->numtriangles = numtriangles;
4698 command->arrays = (float *)data;
// carve the data block into the arrays the vertex stage writes; unused slots stay NULL
4699 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4700 dpsoftrast.firstvertex = firstvertex;
4701 dpsoftrast.numvertices = numvertices;
4702 dpsoftrast.screencoord4f = (float *)data;
4703 data += numvertices*sizeof(float[4]);
4704 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4705 data += numvertices*sizeof(float[4]);
4706 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4708 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4709 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4711 dpsoftrast.post_array4f[j] = (float *)data;
4712 data += numvertices*sizeof(float[4]);
// the element list is copied behind the arrays so the command does not reference caller memory
4714 command->element3i = NULL;
4715 command->element3s = NULL;
4718 command->element3s = (unsigned short *)data;
4719 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4723 command->element3i = (int *)data;
4724 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues numtriangles indexed triangles for rasterization.
// A Draw command is allocated, the vertex stage of the current shader mode is run, and the row range
// it produced (dpsoftrast.drawstarty / drawendy) is clamped to [0, fb_height]. An empty range cancels
// the command again. Otherwise the command is tagged with the clipped flag and a refcount of one per
// raster thread, and the threads are notified or flushed.
// firstvertex / numvertices describe the vertex range; element3i / element3s are the 32-bit / 16-bit
// index lists handed on to DPSOFTRAST_Draw_AllocateDrawCommand.
4729 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4731 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4732 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4733 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4734 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing lands inside the framebuffer: release separately allocated vertex data and take the command back
4735 if (command->starty >= command->endy)
4737 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4738 MM_FREE(command->arrays);
4739 DPSOFTRAST_UndoCommand(command->commandsize);
4742 command->clipped = dpsoftrast.drawclipped;
// every raster thread decrements this once in DPSOFTRAST_Interpret_Draw
4743 command->refcount = dpsoftrast.numthreads;
4746 DPSOFTRAST_Draw_SyncCommands();
// wake threads that are marked starving and whose row band overlaps the draw's row range
4750 for (i = 0; i < dpsoftrast.numthreads; i++)
4752 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4754 nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4755 if (command->starty < nexty && command->endy > y && thread->starving)
4756 SDL_CondSignal(thread->drawcond);
4760 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands on behalf of one thread.
//   thread    - thread state; its commandoffset is where interpretation starts
//   endoffset - interpretation stops once the running offset equals this value
// Commands are read from dpsoftrast.commandpool.commands at the running offset
// and dispatched on their opcode to the matching DPSOFTRAST_Interpret_<name>
// handler. The final offset is written back to thread->commandoffset.
// NOTE(review): the embedded line numbers skip, so braces and short statements
// (e.g. the per-case terminators) are not shown in this listing.
4764 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4766 int commandoffset = thread->commandoffset;
4767 while (commandoffset != endoffset)
4769 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4770 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call the
// handler, advance the offset by the aligned size of that command's struct, and
// reset the offset to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
// (No comments are placed inside the macro because of the line continuations.)
4772 #define INTERPCOMMAND(name) \
4773 case DPSOFTRAST_OPCODE_##name : \
4774 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4775 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4776 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4777 commandoffset = 0; \
4779 INTERPCOMMAND(Viewport)
4780 INTERPCOMMAND(ClearColor)
4781 INTERPCOMMAND(ClearDepth)
4782 INTERPCOMMAND(ColorMask)
4783 INTERPCOMMAND(DepthTest)
4784 INTERPCOMMAND(ScissorTest)
4785 INTERPCOMMAND(Scissor)
4786 INTERPCOMMAND(BlendFunc)
4787 INTERPCOMMAND(BlendSubtract)
4788 INTERPCOMMAND(DepthMask)
4789 INTERPCOMMAND(DepthFunc)
4790 INTERPCOMMAND(DepthRange)
4791 INTERPCOMMAND(PolygonOffset)
4792 INTERPCOMMAND(CullFace)
4793 INTERPCOMMAND(AlphaTest)
4794 INTERPCOMMAND(AlphaFunc)
4795 INTERPCOMMAND(SetTexture)
4796 INTERPCOMMAND(SetShader)
4797 INTERPCOMMAND(Uniform4f)
4798 INTERPCOMMAND(UniformMatrix4f)
4799 INTERPCOMMAND(Uniform1i)
// Draw is handled by hand: the offset advances by the command's own commandsize
// field instead of a fixed struct size, and the new offset is stored in the
// thread state immediately afterwards.
4801 case DPSOFTRAST_OPCODE_Draw:
4802 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4803 commandoffset += command->commandsize;
4804 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4806 thread->commandoffset = commandoffset;
// NOTE(review): the statements of the Reset case are not visible in this listing.
4809 case DPSOFTRAST_OPCODE_Reset:
// Write the final offset back to the thread state.
4814 thread->commandoffset = commandoffset;
// Worker thread entry point (the function handed to SDL_CreateThread in
// DPSOFTRAST_Init).
//   data - pointer to this worker's DPSOFTRAST_State_Thread
// Runs for as long as thread->index is non-negative: pending commands are
// interpreted up to dpsoftrast.drawcommand, after which the thread sleeps on
// its drawcond when nothing is pending.
// NOTE(review): the embedded line numbers skip, so braces, the return statement
// and any other short lines are not shown in this listing.
4818 static int DPSOFTRAST_Draw_Thread(void *data)
4820 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4821 while(thread->index >= 0)
// Work is pending whenever this thread's offset differs from the global drawcommand.
4823 if (thread->commandoffset != dpsoftrast.drawcommand)
4825 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Re-check under drawmutex before sleeping, so the idle decision and the wait
// happen while the mutex is held.
4829 SDL_LockMutex(thread->drawmutex);
4830 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Wake a flusher that flagged itself as waiting on this thread.
4832 if (thread->waiting) SDL_CondSignal(thread->waitcond);
// starving is set only for the duration of the wait on drawcond.
4833 thread->starving = true;
4834 SDL_CondWait(thread->drawcond, thread->drawmutex);
4835 thread->starving = false;
4837 SDL_UnlockMutex(thread->drawmutex);
// Brings the threads up to date with the command queue and then marks the
// command pool as empty (usedcommands = 0).
// First pass: signal starving threads that still have pending commands.
// Second pass: for each thread that is still behind, wait on its waitcond.
// NOTE(review): the embedded line numbers skip, so braces, the declaration of i
// and any preprocessor directives are not shown in this listing.
4844 static void DPSOFTRAST_Draw_FlushThreads(void)
4846 DPSOFTRAST_State_Thread *thread;
4848 DPSOFTRAST_Draw_SyncCommands();
// Pass 1: wake sleeping threads that have work.
4850 for (i = 0; i < dpsoftrast.numthreads; i++)
4852 thread = &dpsoftrast.threads[i];
// Unlocked check first; the condition is re-tested once drawmutex is held.
4853 if (thread->commandoffset != dpsoftrast.drawcommand)
4855 SDL_LockMutex(thread->drawmutex);
4856 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4857 SDL_CondSignal(thread->drawcond);
4858 SDL_UnlockMutex(thread->drawmutex);
// Pass 2: wait for threads that are still behind.
4862 for (i = 0; i < dpsoftrast.numthreads; i++)
4864 thread = &dpsoftrast.threads[i];
4866 if (thread->commandoffset != dpsoftrast.drawcommand)
4868 SDL_LockMutex(thread->drawmutex);
4869 if (thread->commandoffset != dpsoftrast.drawcommand)
// waiting tells the worker to signal waitcond when it runs out of work.
4871 thread->waiting = true;
4872 SDL_CondWait(thread->waitcond, thread->drawmutex);
4873 thread->waiting = false;
4875 SDL_UnlockMutex(thread->drawmutex);
// NOTE(review): this direct interpretation on the calling thread may belong to
// an alternative preprocessor branch (e.g. a build without worker threads);
// the directives are not visible here - confirm.
4878 if (thread->commandoffset != dpsoftrast.drawcommand)
4879 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Mark the command pool as empty.
4882 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: takes no arguments, returns nothing, and simply
// delegates to DPSOFTRAST_Draw_FlushThreads().
4885 void DPSOFTRAST_Flush(void)
4887 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point: takes no arguments and returns nothing.
// NOTE(review): no body lines of this function are visible in this listing -
// confirm what, if anything, it does.
4890 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and starts the worker threads.
//   width, height - framebuffer dimensions, stored as fb_width / fb_height
//   numthreads    - requested worker count (see the bound() call below)
//   colorpixels   - color buffer, stored as fb_colorpixels[0]
//   depthpixels   - depth buffer, stored as fb_depthpixels
// The whole dpsoftrast state is zeroed first, so any field not assigned below
// starts out as zero.
// NOTE(review): the embedded line numbers skip, so braces, the declarations of
// i and u, and any preprocessor directives are not shown in this listing.
4895 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4905 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u is set up on lines not shown; presumably a byte-order probe - confirm.
4906 dpsoftrast.bigendian = u.b[3];
4907 dpsoftrast.fb_width = width;
4908 dpsoftrast.fb_height = height;
4909 dpsoftrast.fb_depthpixels = depthpixels;
4910 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index 1 is assigned three times; this looks like a copy/paste
// slip for successive indices. The memset above already zeroed the array, so
// the other entries hold zero bytes either way - confirm the array size before
// changing the indices.
4911 dpsoftrast.fb_colorpixels[1] = NULL;
4912 dpsoftrast.fb_colorpixels[1] = NULL;
4913 dpsoftrast.fb_colorpixels[1] = NULL;
// Default viewport: origin (0, 0), full framebuffer width and height.
4914 dpsoftrast.viewport[0] = 0;
4915 dpsoftrast.viewport[1] = 0;
4916 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4917 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4918 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture bookkeeping starts with firstfree and end at 1 and max at 0.
4919 dpsoftrast.texture_firstfree = 1;
4920 dpsoftrast.texture_end = 1;
4921 dpsoftrast.texture_max = 0;
// Default color: all four components set to 1.
4922 dpsoftrast.color[0] = 1;
4923 dpsoftrast.color[1] = 1;
4924 dpsoftrast.color[2] = 1;
4925 dpsoftrast.color[3] = 1;
// NOTE(review): the two numthreads assignments below are presumably alternative
// preprocessor branches (requested count limited via bound() to 1..64, versus a
// fixed single thread); the directives are not visible here - confirm.
4927 dpsoftrast.numthreads = bound(1, numthreads, 64);
4929 dpsoftrast.numthreads = 1;
// Zero-initialized array of per-thread state.
// NOTE(review): no NULL check of the allocation is visible before it is used.
4931 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4932 for (i = 0; i < dpsoftrast.numthreads; i++)
4934 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// Per-thread render state defaults.
4936 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned on any visible line, so it keeps
// the zero from MM_CALLOC unless a line not shown sets it - confirm.
4937 thread->colormask[1] = 1;
4938 thread->colormask[2] = 1;
4939 thread->colormask[3] = 1;
4940 thread->blendfunc[0] = GL_ONE;
4941 thread->blendfunc[1] = GL_ZERO;
4942 thread->depthmask = true;
4943 thread->depthtest = true;
4944 thread->depthfunc = GL_LEQUAL;
4945 thread->scissortest = false;
4946 thread->alphatest = false;
4947 thread->alphafunc = GL_GREATER;
4948 thread->alphavalue = 0.5f;
// Per-thread viewport and scissor both default to the full framebuffer.
4949 thread->viewport[0] = 0;
4950 thread->viewport[1] = 0;
4951 thread->viewport[2] = dpsoftrast.fb_width;
4952 thread->viewport[3] = dpsoftrast.fb_height;
4953 thread->scissor[0] = 0;
4954 thread->scissor[1] = 0;
4955 thread->scissor[2] = dpsoftrast.fb_width;
4956 thread->scissor[3] = dpsoftrast.fb_height;
4957 thread->depthrange[0] = 0;
4958 thread->depthrange[1] = 1;
4959 thread->polygonoffset[0] = 0;
4960 thread->polygonoffset[1] = 0;
// Per-thread counters and queue position start at zero; the thread is neither
// waited on nor starving yet.
4962 thread->numspans = 0;
4963 thread->numtriangles = 0;
4964 thread->commandoffset = 0;
4965 thread->waiting = false;
4966 thread->starving = false;
// Synchronization objects used by the worker loop and by the flush routine.
4968 thread->waitcond = SDL_CreateCond();
4969 thread->drawcond = SDL_CreateCond();
4970 thread->drawmutex = SDL_CreateMutex();
// validate is set to -1 and the same value is passed to DPSOFTRAST_Validate
// (presumably "every state bit" - confirm).
4973 thread->validate = -1;
4974 DPSOFTRAST_Validate(thread, -1);
// Start the worker; it receives its own state struct as the thread argument.
4976 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Tears down the rasterizer: stops and joins the worker threads, destroys their
// SDL synchronization objects, frees texture and thread storage, and zeroes
// the global state.
// NOTE(review): the embedded line numbers skip, so braces, the declaration of i
// and any preprocessor directives are not shown in this listing.
4981 void DPSOFTRAST_Shutdown(void)
4985 if(dpsoftrast.numthreads > 0)
4987 DPSOFTRAST_State_Thread *thread;
4988 for (i = 0; i < dpsoftrast.numthreads; i++)
4990 thread = &dpsoftrast.threads[i];
// NOTE(review): a line between the lock and the signal is not shown; presumably
// it makes thread->index negative so the worker loop exits - confirm.
4991 SDL_LockMutex(thread->drawmutex);
4993 SDL_CondSignal(thread->drawcond);
4994 SDL_UnlockMutex(thread->drawmutex);
// Join the worker before destroying the objects it uses.
4995 SDL_WaitThread(thread->thread, NULL);
4996 SDL_DestroyCond(thread->waitcond);
4997 SDL_DestroyCond(thread->drawcond);
4998 SDL_DestroyMutex(thread->drawmutex);
// Release the pixel storage of every texture slot below texture_end.
// NOTE(review): dpsoftrast.texture[i] is read here before the NULL check of
// dpsoftrast.texture further down, and DPSOFTRAST_Init leaves texture_end at 1;
// if no texture table was ever allocated this appears to dereference NULL -
// confirm.
5002 for (i = 0;i < dpsoftrast.texture_end;i++)
5003 if (dpsoftrast.texture[i].bytes)
5004 MM_FREE(dpsoftrast.texture[i].bytes);
// The texture table itself is released with free(), the thread array with MM_FREE.
5005 if (dpsoftrast.texture)
5006 free(dpsoftrast.texture);
5007 if (dpsoftrast.threads)
5008 MM_FREE(dpsoftrast.threads);
// Leave the global state zeroed.
5009 memset(&dpsoftrast, 0, sizeof(dpsoftrast));