#define _USE_MATH_DEFINES
#include <math.h>
#include "quakedef.h"
+#include "thread.h"
#include "dpsoftrast.h"
-#ifdef USE_SDL
-//#define USE_THREADS
-#endif
-
-#ifdef USE_THREADS
-#include <SDL.h>
-#include <SDL_thread.h>
-#endif
-
#ifndef __cplusplus
typedef qboolean bool;
#endif
#if defined(__GNUC__)
#define ALIGN(var) var __attribute__((__aligned__(16)))
#define ATOMIC(var) var __attribute__((__aligned__(32)))
- #ifdef USE_THREADS
- #define MEMORY_BARRIER (_mm_sfence())
- //(__sync_synchronize())
- #define ATOMIC_COUNTER volatile int
- #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
- #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
- #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
- #endif
+ #define MEMORY_BARRIER (_mm_sfence())
+ //(__sync_synchronize())
+ #define ATOMIC_COUNTER volatile int
+ #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
+ #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
+ #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#elif defined(_MSC_VER)
#define ALIGN(var) __declspec(align(16)) var
#define ATOMIC(var) __declspec(align(32)) var
- #ifdef USE_THREADS
- #define MEMORY_BARRIER (_mm_sfence())
- //(MemoryBarrier())
- #define ATOMIC_COUNTER volatile LONG
- #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
- #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
- #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
- #endif
- #else
- #undef USE_THREADS
- #undef SSE2_PRESENT
+ #define MEMORY_BARRIER (_mm_sfence())
+ //(MemoryBarrier())
+ #define ATOMIC_COUNTER volatile LONG
+ #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
+ #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
+ #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
#endif
#endif
-#ifndef SSE2_PRESENT
- #define ALIGN(var) var
- #define ATOMIC(var) var
+#ifndef ALIGN
+#define ALIGN(var) var
#endif
-
-#ifndef USE_THREADS
- #define MEMORY_BARRIER ((void)0)
- #define ATOMIC_COUNTER int
- #define ATOMIC_INCREMENT(counter) (++(counter))
- #define ATOMIC_DECREMENT(counter) (--(counter))
- #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
+#ifndef ATOMIC
+#define ATOMIC(var) var
+#endif
+#ifndef MEMORY_BARRIER
+#define MEMORY_BARRIER ((void)0)
+#endif
+#ifndef ATOMIC_COUNTER
+#define ATOMIC_COUNTER int
+#endif
+#ifndef ATOMIC_INCREMENT
+#define ATOMIC_INCREMENT(counter) (++(counter))
+#endif
+#ifndef ATOMIC_DECREMENT
+#define ATOMIC_DECREMENT(counter) (--(counter))
+#endif
+#ifndef ATOMIC_ADD
+#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
#ifdef SSE2_PRESENT
// calloc-style helper on top of _mm_malloc: requests nmemb*size bytes aligned
// to ATOMIC_SIZE and zero-fills the allocation when it succeeds.
// Returns whatever _mm_malloc returned; a NULL result is passed straight back
// to the caller without being zeroed.
// NOTE(review): nmemb*size is not checked for overflow.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
- if(ptr != NULL) memset(ptr, 0, nmemb*size);
+ if (ptr != NULL) memset(ptr, 0, nmemb*size);
return ptr;
}
int triangle; // triangle this span was generated by
int x; // framebuffer x coord
int y; // framebuffer y coord
- int length; // pixel count
int startx; // usable range (according to pixelmask)
int endx; // usable range (according to pixelmask)
unsigned char *pixelmask; // true for pixels that passed depth test, false for others
DPSOFTRAST_BLENDMODE_MUL2,
DPSOFTRAST_BLENDMODE_SUBALPHA,
DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
+ DPSOFTRAST_BLENDMODE_INVADD,
DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
-#ifdef USE_THREADS
- SDL_Thread *thread;
-#endif
+ void *thread;
int index;
int cullface;
// derived values (DPSOFTRAST_VALIDATE_FB)
int fb_colormask;
- int fb_clearscissor[4];
+ int fb_scissor[4];
ALIGN(float fb_viewportcenter[4]);
ALIGN(float fb_viewportscale[4]);
// derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
int fb_blendmode;
+ // band boundaries
+ int miny1;
+ int maxy1;
+ int miny2;
+ int maxy2;
+
ATOMIC(volatile int commandoffset);
volatile bool waiting;
volatile bool starving;
-#ifdef USE_THREADS
- SDL_cond *waitcond;
- SDL_cond *drawcond;
- SDL_mutex *drawmutex;
-#endif
+ void *waitcond;
+ void *drawcond;
+ void *drawmutex;
int numspans;
int numtriangles;
// error reporting
const char *errorstring;
+ bool usethreads;
+ int interlace;
int numthreads;
DPSOFTRAST_State_Thread *threads;
if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
if (y1 < 0) y1 = 0;
if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
- thread->fb_clearscissor[0] = x1;
- thread->fb_clearscissor[1] = y1;
- thread->fb_clearscissor[2] = x2 - x1;
- thread->fb_clearscissor[3] = y2 - y1;
+ thread->fb_scissor[0] = x1;
+ thread->fb_scissor[1] = y1;
+ thread->fb_scissor[2] = x2 - x1;
+ thread->fb_scissor[3] = y2 - y1;
DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
}
{
#define BLENDFUNC(sfactor, dfactor, blendmode) \
case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
- BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
+ BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
}
}
BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
- BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
+ BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
}
}
dpsoftrast.texture_max *= 2;
dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
- if(dpsoftrast.texbound[i])
+ if (dpsoftrast.texbound[i])
dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
for (j = 0; j < dpsoftrast.numthreads; j++)
{
thread = &dpsoftrast.threads[j];
for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
- if(thread->texbound[i])
+ if (thread->texbound[i])
thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
}
}
// Copies the command pool's current freecommand offset into
// dpsoftrast.drawcommand.
// In the patched ('+') version MEMORY_BARRIER is issued first only when
// dpsoftrast.usethreads is set; the removed ('-') version issued it
// unconditionally.
// NOTE(review): presumably the barrier makes previously written command data
// visible before the offset is updated - confirm against the reader side.
static void DPSOFTRAST_Draw_SyncCommands(void)
{
- MEMORY_BARRIER;
+ if(dpsoftrast.usethreads) MEMORY_BARRIER;
dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
}
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
-#ifdef USE_THREADS
DPSOFTRAST_State_Thread *thread;
int i;
int freecommand = dpsoftrast.commandpool.freecommand;
if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
break;
thread = &dpsoftrast.threads[waitindex];
- SDL_LockMutex(thread->drawmutex);
+ Thread_LockMutex(thread->drawmutex);
if (thread->commandoffset != dpsoftrast.drawcommand)
{
thread->waiting = true;
- if (thread->starving) SDL_CondSignal(thread->drawcond);
- SDL_CondWait(thread->waitcond, thread->drawmutex);
+ if (thread->starving) Thread_CondSignal(thread->drawcond);
+ Thread_CondWait(thread->waitcond, thread->drawmutex);
thread->waiting = false;
}
- SDL_UnlockMutex(thread->drawmutex);
+ Thread_UnlockMutex(thread->drawmutex);
}
dpsoftrast.commandpool.usedcommands = usedcommands;
-#else
- DPSOFTRAST_Draw_FlushThreads();
-#endif
}
#define DPSOFTRAST_ALIGNCOMMAND(size) \
int freecommand = dpsoftrast.commandpool.freecommand;
int usedcommands = dpsoftrast.commandpool.usedcommands;
int extra = sizeof(DPSOFTRAST_Command);
- if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
+ if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
- if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
+ if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
{
- DPSOFTRAST_Draw_FreeCommandPool(size + extra);
+ if (dpsoftrast.usethreads)
+ DPSOFTRAST_Draw_FreeCommandPool(size + extra);
+ else
+ DPSOFTRAST_Draw_FlushThreads();
freecommand = dpsoftrast.commandpool.freecommand;
usedcommands = dpsoftrast.commandpool.usedcommands;
}
- if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
+ if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
{
command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
command->opcode = DPSOFTRAST_OPCODE_Reset;
int freecommand = dpsoftrast.commandpool.freecommand;
int usedcommands = dpsoftrast.commandpool.usedcommands;
freecommand -= size;
+ if (freecommand < 0)
+ freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
usedcommands -= size;
dpsoftrast.commandpool.freecommand = freecommand;
dpsoftrast.commandpool.usedcommands = usedcommands;
DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
{
- int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
+ int i, x1, y1, x2, y2, w, h, x, y;
+ int miny1 = thread->miny1;
+ int maxy1 = thread->maxy1;
+ int miny2 = thread->miny2;
+ int maxy2 = thread->maxy2;
+ int bandy;
unsigned int *p;
unsigned int c;
DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
- x1 = thread->fb_clearscissor[0];
- y1 = thread->fb_clearscissor[1];
- x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
- y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
- t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
- t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
- if(y1 < t1) y1 = t1;
- if(y2 > t2) y2 = t2;
+ x1 = thread->fb_scissor[0];
+ y1 = thread->fb_scissor[1];
+ x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
+ y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
+ if (y1 < miny1) y1 = miny1;
+ if (y2 > maxy2) y2 = maxy2;
w = x2 - x1;
h = y2 - y1;
if (w < 1 || h < 1)
{
if (!dpsoftrast.fb_colorpixels[i])
continue;
- for (y = y1;y < y2;y++)
+ for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
+ for (;y < bandy;y++)
{
p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
for (x = x1;x < x2;x++)
DEFCOMMAND(3, ClearDepth, float depth;)
static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
{
- int x1, y1, x2, y2, w, h, x, y, t1, t2;
+ int x1, y1, x2, y2, w, h, x, y;
+ int miny1 = thread->miny1;
+ int maxy1 = thread->maxy1;
+ int miny2 = thread->miny2;
+ int maxy2 = thread->maxy2;
+ int bandy;
unsigned int *p;
unsigned int c;
DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
- x1 = thread->fb_clearscissor[0];
- y1 = thread->fb_clearscissor[1];
- x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
- y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
- t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
- t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
- if(y1 < t1) y1 = t1;
- if(y2 > t2) y2 = t2;
+ x1 = thread->fb_scissor[0];
+ y1 = thread->fb_scissor[1];
+ x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
+ y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
+ if (y1 < miny1) y1 = miny1;
+ if (y2 > maxy2) y2 = maxy2;
w = x2 - x1;
h = y2 - y1;
if (w < 1 || h < 1)
return;
c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
- for (y = y1;y < y2;y++)
+ for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
+ for (;y < bandy;y++)
{
p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
for (x = x1;x < x2;x++)
int bx2 = blockx + blockwidth;
int by2 = blocky + blockheight;
int bw;
- int bh;
int x;
int y;
unsigned char *inpixels;
if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
bw = bx2 - bx1;
- bh = by2 - by1;
inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
if (dpsoftrast.bigendian)
{
DPSOFTRAST_Texture *texture;
texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
if (mip < 0 || mip >= texture->mipmaps) return;
- if (texture->binds)
- DPSOFTRAST_Flush();
+ DPSOFTRAST_Flush();
spixels = dpsoftrast.fb_colorpixels[0];
swidth = dpsoftrast.fb_width;
sheight = dpsoftrast.fb_height;
{
__m128 m0, m1, m2, m3;
DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
- command->index = index;
+ command->index = (DPSOFTRAST_UNIFORM)index;
if (((size_t)v)&(ALIGN_SIZE-1))
{
m0 = _mm_loadu_ps(v);
*endy = _mm_cvttss_si32(minproj)+1;
return clipmask;
}
-#endif
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
-#ifdef SSE2_PRESENT
float *end = out4f + numitems*4;
__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
__m128 minpos, maxpos;
_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
return 0;
-#endif
}
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
-#ifdef SSE2_PRESENT
static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
float *end;
if (starty && endy)
return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
return 0;
-#endif
}
+#endif
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
+#ifdef SSE2_PRESENT
float *outf = dpsoftrast.post_array4f[outarray];
const unsigned char *inb;
int firstvertex = dpsoftrast.firstvertex;
break;
}
return outf;
+#else
+ return NULL;
+#endif
}
static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
#if 0
// Projects a vertex array in place (this definition sits inside an "#if 0"
// region in the visible source).
// When inarray >= 0 the data comes from DPSOFTRAST_Array_Load(outarray, inarray);
// otherwise dpsoftrast.post_array4f[outarray] is used as it is.
// The buffer is passed to DPSOFTRAST_Vertex_Project as both input and output;
// its return value is stored in dpsoftrast.drawclipped and the Y range is
// written to dpsoftrast.drawstarty / dpsoftrast.drawendy.
// Returns the data pointer, or NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
+#ifdef SSE2_PRESENT
float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
return data;
+#else
+ return NULL;
+#endif
}
#endif
// Transforms a vertex array by inmatrix16f and projects it, in place.
// When inarray >= 0 the data comes from DPSOFTRAST_Array_Load(outarray, inarray);
// otherwise dpsoftrast.post_array4f[outarray] is used as it is.
// The buffer is passed to DPSOFTRAST_Vertex_TransformProject as both input and
// output; its return value is stored in dpsoftrast.drawclipped and the Y range
// is written to dpsoftrast.drawstarty / dpsoftrast.drawendy.
// Returns the data pointer, or NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
+#ifdef SSE2_PRESENT
float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
return data;
+#else
+ return NULL;
+#endif
}
void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
{
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
float z = endz, dz;
- if(nextsub >= endx) nextsub = endsub = endx-1;
+ if (nextsub >= endx) nextsub = endsub = endx-1;
endz = 1.0f / (w + wslope * nextsub);
dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
for (; x <= endsub; x++, z += dz)
pixel[x*4+3] = d[3];
}
break;
+ case DPSOFTRAST_BLENDMODE_INVADD:
+ for (x = startx;x < endx;x++)
+ {
+ if (!pixelmask[x])
+ continue;
+ d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
+ d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
+ d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
+ d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
+ pixel[x*4+0] = d[0];
+ pixel[x*4+1] = d[1];
+ pixel[x*4+2] = d[2];
+ pixel[x*4+3] = d[3];
+ }
+ break;
}
}
break;
case DPSOFTRAST_BLENDMODE_ALPHA:
#define FINISHBLEND(blend2, blend1) \
- for (x = startx;x + 2 <= endx;x += 2) \
+ for (x = startx;x + 1 < endx;x += 2) \
{ \
__m128i src, dst; \
switch (*(const unsigned short*)&pixelmask[x]) \
dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
});
break;
+ case DPSOFTRAST_BLENDMODE_INVADD:
+ FINISHBLEND({
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+ }, {
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+ });
+ break;
}
#endif
}
unsigned int substep[2];
float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
- if(nextsub >= endx)
+ if (nextsub >= endx)
{
nextsub = endsub = endx-1;
- if(x < nextsub) subscale = 65536.0f / (nextsub - x);
+ if (x < nextsub) subscale = 65536.0f / (nextsub - x);
}
tc[0] = endtc[0];
tc[1] = endtc[1];
substep[1] = (endtc[1] - tc[1]) * subscale;
subtc[0] = tc[0] * (1<<16);
subtc[1] = tc[1] * (1<<16);
- if(filter)
+ if (filter)
{
if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
{
// if no texture is bound, just fill it with white
if (!texture)
{
- memset(out4ub + startx*4, 255, span->length*4);
+ memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
return;
}
mip = triangle->mip[texunitindex];
{
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
__m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
- if(nextsub >= endx)
+ if (nextsub >= endx)
{
nextsub = endsub = endx-1;
- if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
+ if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
}
tc = endtc;
subtc = endsubtc;
// Cube-map texture fetch for one span. Not implemented yet: as a placeholder
// the usable part of the span, pixels [span->startx, span->endx), is filled
// with 0xFF bytes in out4ub (4 bytes per pixel).
// triangle, texunitindex, arrayindex and zf are currently unused.
void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
{
// TODO: IMPLEMENT
// The byte count must be (endx - startx)*4; the reversed difference is
// negative for any non-empty span and turns into a huge size_t in memset.
- memset(out4ub, 255, span->length*4);
+ memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
}
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
__m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
- if(nextsub >= endx)
+ if (nextsub >= endx)
{
nextsub = endsub = endx-1;
- if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
+ if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
}
mod = endmod;
submod = endsubmod;
{
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
__m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
- if(nextsub >= endx)
+ if (nextsub >= endx)
{
nextsub = endsub = endx-1;
- if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
+ if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
}
mod = endmod;
submod = endsubmod;
pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
}
- if(x < endx)
+ if (x < endx)
{
__m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
__m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
pix1 = _mm_mulhi_epu16(pix1, pix2);
_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
}
- if(x < endx)
+ if (x < endx)
{
__m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
__m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
pix1 = _mm_add_epi16(pix1, pix2);
_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
}
- if(x < endx)
+ if (x < endx)
{
__m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
__m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
}
- if(x < endx)
+ if (x < endx)
{
__m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
__m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
}
- if(x < endx)
+ if (x < endx)
{
__m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
__m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
}
- if(x < endx)
+ if (x < endx)
{
__m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
// Flat-color pixel shader for one span: samples the GL20TU_COLOR texture for
// the span and multiplies each texel by the Color_Ambient uniform (alpha taken
// from the Alpha uniform).
// The patched ('+') version is SSE2-only (the whole body is compiled out when
// SSE2_PRESENT is undefined) and replaces the removed ('-') scalar code.
// Results go straight to color buffer 0 unless alpha test is enabled or the
// blend mode is not OPAQUE; in that case they go to buffer_FragColorbgra8 and
// DPSOFTRAST_Draw_Span_FinishBGRA8 is called afterwards.
void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
+#ifdef SSE2_PRESENT
+ unsigned char * RESTRICT pixelmask = span->pixelmask;
+ unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
int x, startx = span->startx, endx = span->endx;
- int Color_Ambienti[4];
+ __m128i Color_Ambientm;
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
- Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
- Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
- Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
- Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the scratch buffer when the span still needs alpha test or blending
+ if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+ pixel = buffer_FragColorbgra8;
// scale the ambient color by 256 and swap lanes 0 and 2 to match the BGRA
// byte order (same mapping as the removed Color_Ambienti[2]/[0] assignments)
+ Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the fourth lane and replace it with the Alpha uniform
// NOTE(review): alpha is scaled by 255 here while the removed scalar code
// used 256 - confirm this is intentional
+ Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
+ Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes; both 64-bit halves hold the same four factors
+ Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
for (x = startx;x < endx;x++)
{
- buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
- buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
- buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
- buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
+ __m128i color, pix;
// fast path: four consecutive pixels whose mask bytes are all 1 are shaded at once
+ if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
+ {
+ __m128i pix2;
+ color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
+ pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
+ pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
+ _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// advance by 3 here; the loop's own x++ supplies the fourth step
+ x += 3;
+ continue;
+ }
// slow path: one pixel at a time, skipping pixels whose mask byte is 0
+ if (!pixelmask[x])
+ continue;
+ color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
+ pix = _mm_mulhi_epu16(Color_Ambientm, color);
+ *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
}
- DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// only the scratch-buffer path needs the finishing pass
+ if (pixel == buffer_FragColorbgra8)
+ DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
+#endif
}
x += 3;
continue;
}
- if(!pixelmask[x])
+ if (!pixelmask[x])
continue;
color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
*(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
}
- if(pixel == buffer_FragColorbgra8)
+ if (pixel == buffer_FragColorbgra8)
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
x += 3;
continue;
}
- if(!pixelmask[x])
+ if (!pixelmask[x])
continue;
color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
x += 3;
continue;
}
- if(!pixelmask[x])
+ if (!pixelmask[x])
continue;
color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
*(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
}
}
- if(pixel == buffer_FragColorbgra8)
+ if (pixel == buffer_FragColorbgra8)
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
- memset(buffer_FragColorbgra8, 0, span->length*4);
+ memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
+ startx = span->startx;
+ endx = span->endx;
switch(thread->fb_depthfunc)
{
default:
- case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
- case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
- case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
- case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
- case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
- case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
- case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
+ case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
+ case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
+ case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
+ case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
+ case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
+ case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
+ case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
}
//colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
- //for (x = 0;x < span->length;x++)
+ //for (x = startx;x < endx;x++)
// colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
// if there is no color buffer, skip pixel shader
- startx = 0;
- endx = span->length;
while (startx < endx && !pixelmask[startx])
startx++;
while (endx > startx && !pixelmask[endx-1])
// if there is no color buffer, skip pixel shader
if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
{
- memset(pixelmask, 1, span->length);
+ memset(pixelmask + span->startx, 1, span->endx - span->startx);
span->pixelmask = pixelmask;
- span->startx = 0;
- span->endx = span->length;
DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
}
}
{
#ifdef SSE2_PRESENT
int cullface = thread->cullface;
- int width = dpsoftrast.fb_width;
- int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
- int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
+ int minx, maxx, miny, maxy;
+ int miny1, maxy1, miny2, maxy2;
__m128i fbmin, fbmax;
__m128 viewportcenter, viewportscale;
int firstvertex = command->firstvertex;
int y;
int e[3];
__m128i screeny;
- int starty, endy;
+ int starty, endy, bandy;
int numpoints;
int clipcase;
float clipdist[4];
__m128 screen[4];
DPSOFTRAST_State_Triangle *triangle;
DPSOFTRAST_Texture *texture;
- if (command->starty >= maxy || command->endy <= miny)
+ DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
+ miny = thread->fb_scissor[1];
+ maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
+ miny1 = bound(miny, thread->miny1, maxy);
+ maxy1 = bound(miny, thread->maxy1, maxy);
+ miny2 = bound(miny, thread->miny2, maxy);
+ maxy2 = bound(miny, thread->maxy2, maxy);
+ if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
{
if (!ATOMIC_DECREMENT(command->refcount))
{
}
return;
}
- DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
- fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
- fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
+ minx = thread->fb_scissor[0];
+ maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
+ fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
+ fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
viewportscale = _mm_load_ps(thread->fb_viewportscale);
screen[3] = _mm_setzero_ps();
continue;
starty = _mm_extract_epi16(screenmin, 1);
endy = _mm_extract_epi16(screenmax, 1)+1;
+ if (starty >= maxy1 && endy <= miny2)
+ continue;
screeny = _mm_srai_epi32(screeni, 16);
}
}
}
}
-
- for (y = starty; y < endy;)
+
+ for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
+ for (; y < bandy;)
{
__m128 xcoords, xslope;
__m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
nexty = _mm_extract_epi16(ycc, 0);
- if(nexty >= endy) nexty = endy-1;
- if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
- {
- int tmp = edge0n;
- edge0n = edge1n;
- edge1n = tmp;
- tmp = edge0p;
- edge0p = edge1p;
- edge1p = tmp;
- }
+ if (nexty >= bandy) nexty = bandy-1;
xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
_mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
+ if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
+ {
+ xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
+ xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
+ }
for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
{
int startx, endx, offset;
startx = _mm_cvtss_si32(xcoords);
endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
- if (startx < 0) startx = 0;
- if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
+ if (startx < minx)
+ {
+ if (startx < 0) startx = 0;
+ startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
+ }
+ if (endx > maxx) endx = maxx;
if (startx >= endx) continue;
- for (offset = startx; offset < endx;)
+ for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
{
DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
span->triangle = thread->numtriangles;
span->x = offset;
span->y = y;
- span->length = endx - offset;
- if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
- span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
- offset += span->length;
+ span->startx = max(minx - offset, 0);
+ span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
+ if (span->startx >= span->endx)
+ continue;
if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
DPSOFTRAST_Draw_ProcessSpans(thread);
}
command->clipped = dpsoftrast.drawclipped;
command->refcount = dpsoftrast.numthreads;
-#ifdef USE_THREADS
- DPSOFTRAST_Draw_SyncCommands();
+ if (dpsoftrast.usethreads)
{
int i;
- int nexty = 0;
+ DPSOFTRAST_Draw_SyncCommands();
for (i = 0; i < dpsoftrast.numthreads; i++)
{
DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
- int y = nexty;
- nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
- if (command->starty < nexty && command->endy > y && thread->starving)
- SDL_CondSignal(thread->drawcond);
+ if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
+ Thread_CondSignal(thread->drawcond);
}
}
-#else
- DPSOFTRAST_Draw_FlushThreads();
-#endif
+ else
+ {
+ DPSOFTRAST_Draw_FlushThreads();
+ }
}
static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
thread->commandoffset = commandoffset;
}
-#ifdef USE_THREADS
static int DPSOFTRAST_Draw_Thread(void *data)
{
DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
}
else
{
- SDL_LockMutex(thread->drawmutex);
+ Thread_LockMutex(thread->drawmutex); // the commandoffset/index test that follows runs with drawmutex held
if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
{
- if (thread->waiting) SDL_CondSignal(thread->waitcond);
+ if (thread->waiting) Thread_CondSignal(thread->waitcond); // wake a caller blocked on waitcond; DPSOFTRAST_Draw_FlushThreads sets waiting around its wait
thread->starving = true;
- SDL_CondWait(thread->drawcond, thread->drawmutex);
+ Thread_CondWait(thread->drawcond, thread->drawmutex); // sleep on drawcond; starving is set before and cleared after this wait so signallers can test it
thread->starving = false;
}
- SDL_UnlockMutex(thread->drawmutex);
+ Thread_UnlockMutex(thread->drawmutex);
}
}
return 0;
}
-#endif
static void DPSOFTRAST_Draw_FlushThreads(void)
{
DPSOFTRAST_State_Thread *thread;
int i;
DPSOFTRAST_Draw_SyncCommands();
-#ifdef USE_THREADS
- for (i = 0; i < dpsoftrast.numthreads; i++)
+ if (dpsoftrast.usethreads) // threaded mode: wake threads that are behind, then block until each one has caught up
{
- thread = &dpsoftrast.threads[i];
- if (thread->commandoffset != dpsoftrast.drawcommand)
+ for (i = 0; i < dpsoftrast.numthreads; i++) // pass 1: signal every thread that is behind and marked starving
{
- SDL_LockMutex(thread->drawmutex);
- if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
- SDL_CondSignal(thread->drawcond);
- SDL_UnlockMutex(thread->drawmutex);
+ thread = &dpsoftrast.threads[i];
+ if (thread->commandoffset != dpsoftrast.drawcommand) // unlocked pre-check; repeated below with drawmutex held
+ {
+ Thread_LockMutex(thread->drawmutex);
+ if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving) // only signal a thread that flagged itself starving
+ Thread_CondSignal(thread->drawcond);
+ Thread_UnlockMutex(thread->drawmutex);
+ }
}
- }
-#endif
- for (i = 0; i < dpsoftrast.numthreads; i++)
- {
- thread = &dpsoftrast.threads[i];
-#ifdef USE_THREADS
- if (thread->commandoffset != dpsoftrast.drawcommand)
+ for (i = 0; i < dpsoftrast.numthreads; i++) // pass 2: wait for each thread in turn
{
- SDL_LockMutex(thread->drawmutex);
+ thread = &dpsoftrast.threads[i];
if (thread->commandoffset != dpsoftrast.drawcommand)
{
- thread->waiting = true;
- SDL_CondWait(thread->waitcond, thread->drawmutex);
- thread->waiting = false;
+ Thread_LockMutex(thread->drawmutex);
+ if (thread->commandoffset != dpsoftrast.drawcommand) // still behind with the mutex held: wait for the thread to report in
+ {
+ thread->waiting = true; // DPSOFTRAST_Draw_Thread signals waitcond only while this flag is set
+ Thread_CondWait(thread->waitcond, thread->drawmutex); // NOTE(review): guarded by 'if', not re-checked in a loop - confirm Thread_CondWait cannot return spuriously
+ thread->waiting = false;
+ }
+ Thread_UnlockMutex(thread->drawmutex);
}
- SDL_UnlockMutex(thread->drawmutex);
}
-#else
- if (thread->commandoffset != dpsoftrast.drawcommand)
- DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
-#endif
+ }
+ else // no worker threads: interpret the pending commands directly from this function
+ {
+ for (i = 0; i < dpsoftrast.numthreads; i++)
+ {
+ thread = &dpsoftrast.threads[i];
+ if (thread->commandoffset != dpsoftrast.drawcommand)
+ DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand); // second argument is the end offset to interpret up to
+ }
}
dpsoftrast.commandpool.usedcommands = 0;
}
DPSOFTRAST_Flush();
}
-void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
+int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels) // new 'interlace' argument selects two-band row ownership per thread; now returns int
{
int i;
union
dpsoftrast.color[1] = 1;
dpsoftrast.color[2] = 1;
dpsoftrast.color[3] = 1;
-#ifdef USE_THREADS
- dpsoftrast.numthreads = bound(1, numthreads, 64);
-#else
- dpsoftrast.numthreads = 1;
-#endif
+ dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads(); // threading is a runtime choice: requested by the caller and reported available
+ dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0; // forced to 0 without threads; bound() presumably clamps to 0..1
+ dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1; // exactly one thread state when not threaded
dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
for (i = 0; i < dpsoftrast.numthreads; i++)
{
thread->depthrange[1] = 1;
thread->polygonoffset[0] = 0;
thread->polygonoffset[1] = 0;
+
+ if (dpsoftrast.interlace) // interlaced: fb_height is cut into 2*numthreads slices and thread i gets slice i and slice numthreads+i
+ {
+ thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+ thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+ thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+ thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+ }
+ else // not interlaced: both bands are the same single slice i out of numthreads
+ {
+ thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
+ thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
+ }
thread->numspans = 0;
thread->numtriangles = 0;
thread->commandoffset = 0;
thread->waiting = false;
thread->starving = false;
-#ifdef USE_THREADS
- thread->waitcond = SDL_CreateCond();
- thread->drawcond = SDL_CreateCond();
- thread->drawmutex = SDL_CreateMutex();
-#endif
-
+
thread->validate = -1;
DPSOFTRAST_Validate(thread, -1);
-#ifdef USE_THREADS
- thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
-#endif
+
+ if (dpsoftrast.usethreads) // condition variables, mutex and worker exist only in threaded mode
+ {
+ thread->waitcond = Thread_CreateCond();
+ thread->drawcond = Thread_CreateCond();
+ thread->drawmutex = Thread_CreateMutex();
+ thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread); // NOTE(review): none of the Thread_Create* results are checked in the lines visible here
+ }
}
+ return 0; // the only return visible in this diff; no failure path is shown
}
void DPSOFTRAST_Shutdown(void)
{
int i;
-#ifdef USE_THREADS
- if(dpsoftrast.numthreads > 0)
+ if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
{
DPSOFTRAST_State_Thread *thread;
for (i = 0; i < dpsoftrast.numthreads; i++)
{
thread = &dpsoftrast.threads[i];
- SDL_LockMutex(thread->drawmutex);
+ Thread_LockMutex(thread->drawmutex);
thread->index = -1;
- SDL_CondSignal(thread->drawcond);
- SDL_UnlockMutex(thread->drawmutex);
- SDL_WaitThread(thread->thread, NULL);
- SDL_DestroyCond(thread->waitcond);
- SDL_DestroyCond(thread->drawcond);
- SDL_DestroyMutex(thread->drawmutex);
+ Thread_CondSignal(thread->drawcond);
+ Thread_UnlockMutex(thread->drawmutex);
+ Thread_WaitThread(thread->thread, 0);
+ Thread_DestroyCond(thread->waitcond);
+ Thread_DestroyCond(thread->drawcond);
+ Thread_DestroyMutex(thread->drawmutex);
}
}
-#endif
for (i = 0;i < dpsoftrast.texture_end;i++)
if (dpsoftrast.texture[i].bytes)
MM_FREE(dpsoftrast.texture[i].bytes);