int scissor[4];
float depthrange[2];
float polygonoffset[2];
+ ALIGN(float clipplane[4]);
int shader_mode;
int shader_permutation;
dpsoftrast.uniform1i[command->index] = i0;
}
+// ClipPlane command (number 24): its payload is the four plane coefficients.
+DEFCOMMAND(24, ClipPlane, float clipplane[4];)
+// Interpreter for the ClipPlane command: copies the queued coefficients into
+// the interpreting thread's clipplane state.
+static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
+{
+ // the payload is a float[4], so its size is the full plane
+ memcpy(thread->clipplane, command->clipplane, sizeof(command->clipplane));
+}
+// Queue a ClipPlane command for the plane (x, y, z, w).
+// Each coefficient is divided by a component of dpsoftrast.fb_viewportscale and
+// the w term is then offset using dpsoftrast.fb_viewportcenter before storing.
+void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
+{
+ DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
+ float plane[4];
+ int i;
+ // note the index order: x, y, z pair with components 1, 2, 3 and w with component 0
+ plane[0] = x / dpsoftrast.fb_viewportscale[1];
+ plane[1] = y / dpsoftrast.fb_viewportscale[2];
+ plane[2] = z / dpsoftrast.fb_viewportscale[3];
+ plane[3] = w / dpsoftrast.fb_viewportscale[0];
+ // the offset uses the already rescaled coefficients, including the rescaled w itself
+ plane[3] = plane[3] - (dpsoftrast.fb_viewportcenter[1]*plane[0] + dpsoftrast.fb_viewportcenter[2]*plane[1] + dpsoftrast.fb_viewportcenter[3]*plane[2] + dpsoftrast.fb_viewportcenter[0]*plane[3]);
+ for (i = 0;i < 4;i++)
+ command->clipplane[i] = plane[i];
+}
+
#ifdef SSE_POSSIBLE
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
float wslope = triangle->w[0];
float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
float endz = 1.0f / (w + wslope * startx);
+ if (triangle->w[0] == 0)
+ {
+ // LordHavoc: fast flat polygons (HUD/menu)
+ for (x = startx;x < endx;x++)
+ zf[x] = endz;
+ return;
+ }
for (x = startx;x < endx;)
{
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
int x;
int startx = span->startx;
int endx = span->endx;
+ int subx;
const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
unsigned char * RESTRICT pixelmask = span->pixelmask;
unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// handle alphatest now (this affects depth writes too)
if (thread->alphatest)
for (x = startx;x < endx;x++)
- if (in4ub[x*4+3] < 0.5f)
+ if (in4ub[x*4+3] < 128)
pixelmask[x] = false;
- // FIXME: this does not handle bigendian
+ // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
+ // helps sprites, text and hud artwork
switch(thread->fb_blendmode)
{
+ case DPSOFTRAST_BLENDMODE_ALPHA:
+ case DPSOFTRAST_BLENDMODE_ADDALPHA:
+ case DPSOFTRAST_BLENDMODE_SUBALPHA:
+ for (x = startx;x < endx;x++)
+ if (in4ub[x*4+3] < 1)
+ pixelmask[x] = false;
+ break;
case DPSOFTRAST_BLENDMODE_OPAQUE:
- for (x = startx;x + 4 <= endx;)
+ case DPSOFTRAST_BLENDMODE_ADD:
+ case DPSOFTRAST_BLENDMODE_INVMOD:
+ case DPSOFTRAST_BLENDMODE_MUL:
+ case DPSOFTRAST_BLENDMODE_MUL2:
+ case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
+ case DPSOFTRAST_BLENDMODE_INVADD:
+ break;
+ }
+ // put some special values at the end of the mask to ensure the loops end
+ pixelmask[endx] = 1;
+ pixelmask[endx+1] = 0;
+ // LordHavoc: use a double loop to identify subspans, this helps the
+ // optimized copy/blend loops to perform at their best, most triangles
+ // have only one run of pixels, and do the search using wide reads...
+ x = startx;
+ while (x < endx)
+ {
+ // if this pixel is masked off, it's probably not alone...
+ if (!pixelmask[x])
{
- if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
+ x++;
+#if 1
+ if (x + 8 < endx)
{
- _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
- x += 4;
+ // the 4-item search must be aligned or else it stalls badly
+ if ((x & 3) && !pixelmask[x]) x++;
+ if ((x & 3) && !pixelmask[x]) x++;
+ if ((x & 3) && !pixelmask[x]) x++;
+ // read the four mask bytes starting at pixelmask[x]: the cast has to be
+ // applied to the element address, casting the base pointer first would
+ // scale the index by sizeof(unsigned int) and read at byte offset 4*x
+ while (*(const unsigned int *)&pixelmask[x] == 0x00000000)
+ x += 4;
+ }
+#endif
+ for (;!pixelmask[x];x++)
+ ;
+ // rather than continue the loop, just check the end variable
+ if (x >= endx)
+ break;
+ }
+ // find length of subspan
+ subx = x + 1;
+#if 1
+ if (x + 8 < endx)
+ {
+ // step to a 4-item boundary while the pixels are still enabled
+ if ((subx & 3) && pixelmask[subx]) subx++;
+ if ((subx & 3) && pixelmask[subx]) subx++;
+ if ((subx & 3) && pixelmask[subx]) subx++;
+ // read the four mask bytes starting at pixelmask[subx]: the cast has to be
+ // applied to the element address, casting the base pointer first would
+ // scale the index by sizeof(unsigned int) and read at byte offset 4*subx
+ while (*(const unsigned int *)&pixelmask[subx] == 0x01010101)
+ subx += 4;
+ }
+#endif
+ for (;pixelmask[subx];subx++)
+ ;
+ // the checks can overshoot, so make sure to clip it...
+ if (subx > endx)
+ subx = endx;
+ // now that we know the subspan length... process!
+ switch(thread->fb_blendmode)
+ {
+ case DPSOFTRAST_BLENDMODE_OPAQUE:
+#if 0
+ if (subx - x >= 16)
+ {
+ memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
+ x = subx;
}
else
+#elif 1
+ while (x + 16 <= subx)
+ {
+ _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+ _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
+ _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
+ _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
+ x += 16;
+ }
+#endif
{
- if (pixelmask[x])
+ while (x + 4 <= subx)
+ {
+ _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+ x += 4;
+ }
+ if (x + 2 <= subx)
+ {
pixeli[x] = ini[x];
- x++;
+ pixeli[x+1] = ini[x+1];
+ x += 2;
+ }
+ if (x < subx)
+ {
+ pixeli[x] = ini[x];
+ x++;
+ }
}
- }
- for (;x < endx;x++)
- if (pixelmask[x])
- pixeli[x] = ini[x];
- break;
- case DPSOFTRAST_BLENDMODE_ALPHA:
- #define FINISHBLEND(blend2, blend1) \
- for (x = startx;x + 1 < endx;x += 2) \
- { \
- __m128i src, dst; \
- switch (*(const unsigned short*)&pixelmask[x]) \
+ break;
+ case DPSOFTRAST_BLENDMODE_ALPHA:
+ #define FINISHBLEND(blend2, blend1) \
+ for (;x + 1 < subx;x += 2) \
{ \
- case 0x0101: \
+ __m128i src, dst; \
src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
blend2; \
_mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
- continue; \
- case 0x0100: \
- src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
- dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
- blend1; \
- pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
- continue; \
- case 0x0001: \
+ } \
+ if (x < subx) \
+ { \
+ __m128i src, dst; \
src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
blend1; \
pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
- continue; \
- } \
- break; \
- } \
- for(;x < endx; x++) \
- { \
- __m128i src, dst; \
- if (!pixelmask[x]) \
- continue; \
- src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
- dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
- blend1; \
- pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
+ x++; \
+ }
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+ });
+ break;
+ case DPSOFTRAST_BLENDMODE_ADDALPHA:
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ });
+ break;
+ case DPSOFTRAST_BLENDMODE_ADD:
+ FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
+ break;
+ case DPSOFTRAST_BLENDMODE_INVMOD:
+ FINISHBLEND({
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+ }, {
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+ });
+ break;
+ case DPSOFTRAST_BLENDMODE_MUL:
+ FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
+ break;
+ case DPSOFTRAST_BLENDMODE_MUL2:
+ FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
+ break;
+ case DPSOFTRAST_BLENDMODE_SUBALPHA:
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ });
+ break;
+ case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+ });
+ break;
+ case DPSOFTRAST_BLENDMODE_INVADD:
+ FINISHBLEND({
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+ }, {
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+ });
+ break;
}
-
- FINISHBLEND({
- __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
- }, {
- __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
- });
- break;
- case DPSOFTRAST_BLENDMODE_ADDALPHA:
- FINISHBLEND({
- __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
- }, {
- __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
- });
- break;
- case DPSOFTRAST_BLENDMODE_ADD:
- FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
- break;
- case DPSOFTRAST_BLENDMODE_INVMOD:
- FINISHBLEND({
- dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
- }, {
- dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
- });
- break;
- case DPSOFTRAST_BLENDMODE_MUL:
- FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
- break;
- case DPSOFTRAST_BLENDMODE_MUL2:
- FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
- break;
- case DPSOFTRAST_BLENDMODE_SUBALPHA:
- FINISHBLEND({
- __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
- }, {
- __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
- });
- break;
- case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
- FINISHBLEND({
- __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
- }, {
- __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
- dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
- });
- break;
- case DPSOFTRAST_BLENDMODE_INVADD:
- FINISHBLEND({
- dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
- }, {
- dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
- });
- break;
}
#endif
}
tcimax[1] = texture->mipmap[mip][3]-1;
tciwrapmask[0] = texture->mipmap[mip][2]-1;
tciwrapmask[1] = texture->mipmap[mip][3]-1;
- endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
- endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
+ endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
+ endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
+ if (filter)
+ {
+ endtc[0] -= 0.5f;
+ endtc[1] -= 0.5f;
+ }
for (x = startx;x < endx;)
{
unsigned int subtc[2];
unsigned int substep[2];
- float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
+ float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
if (nextsub >= endx)
{
nextsub = endsub = endx-1;
- if (x < nextsub) subscale = 65536.0f / (nextsub - x);
+ if (x < nextsub) subscale = 4096.0f / (nextsub - x);
}
tc[0] = endtc[0];
tc[1] = endtc[1];
- endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
- endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
+ endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
+ endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
+ if (filter)
+ {
+ endtc[0] -= 0.5f;
+ endtc[1] -= 0.5f;
+ }
substep[0] = (endtc[0] - tc[0]) * subscale;
substep[1] = (endtc[1] - tc[1]) * subscale;
- subtc[0] = tc[0] * (1<<16);
- subtc[1] = tc[1] * (1<<16);
+ subtc[0] = tc[0] * (1<<12);
+ subtc[1] = tc[1] * (1<<12);
if (filter)
{
if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>16;
- tci[1] = subtc[1]>>16;
+ tci[0] = subtc[0]>>12;
+ tci[1] = subtc[1]>>12;
tci1[0] = tci[0] + 1;
tci1[1] = tci[1] + 1;
tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>16;
- tci[1] = subtc[1]>>16;
+ tci[0] = subtc[0]>>12;
+ tci[1] = subtc[1]>>12;
tci1[0] = tci[0] + 1;
tci1[1] = tci[1] + 1;
tci[0] &= tciwrapmask[0];
{
for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
{
- tci[0] = subtc[0]>>16;
- tci[1] = subtc[1]>>16;
+ tci[0] = subtc[0]>>12;
+ tci[1] = subtc[1]>>12;
tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
{
for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
{
- tci[0] = subtc[0]>>16;
- tci[1] = subtc[1]>>16;
+ tci[0] = subtc[0]>>12;
+ tci[1] = subtc[1]>>12;
tci[0] &= tciwrapmask[0];
tci[1] &= tciwrapmask[1];
pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
__m128i subtc, substep, endsubtc;
int filter;
int mip;
+ int affine; // LordHavoc: optimized affine texturing case
unsigned int * RESTRICT outi = (unsigned int *)out4ub;
const unsigned char * RESTRICT pixelbase;
DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
outi[x] = k;
return;
}
+ affine = zf[startx] == zf[endx-1];
filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
flags = texture->flags;
tcscale = _mm_cvtepi32_ps(tcsize);
data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
- endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
+ endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+ if (filter)
+ endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
tcmax = _mm_packs_epi32(tcmask, tcmask);
{
int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
__m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
- if (nextsub >= endx)
+ if (nextsub >= endx || affine)
{
nextsub = endsub = endx-1;
if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
}
tc = endtc;
subtc = endsubtc;
- endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
+ endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+ if (filter)
+ endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
// texture reads
unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
- //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// varyings
// read textures
DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
- //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
// read varyings
DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
// " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
-
+
// " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
{
- unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
+ unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- int tci[2] = { tc[0]>>16, tc[1]>>16 };
+ int tci[2] = { tc[0]>>12, tc[1]>>12 };
int tci1[2] = { tci[0] + 1, tci[1] + 1 };
tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
}
else
{
- int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
- int tci1[2] = { tci[0] + 1, tci[1] + 1 };
+ int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
- tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
- tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
c[0] = pixel[0][0];
c[1] = pixel[0][1];
unsigned int d;
DPSOFTRAST_State_Triangle *triangle;
DPSOFTRAST_State_Span *span;
- unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
+ unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
for (i = 0; i < thread->numspans; i++)
{
span = &thread->spans[i];
int numpoints;
int clipcase;
float clipdist[4];
+ float clip0origin, clip0slope;
+ int clip0dir;
__m128 triangleedge1, triangleedge2, trianglenormal;
__m128 clipfrac[3];
__m128 screen[4];
_mm_store_ss(&triangle->w[0], attribxslope);
_mm_store_ss(&triangle->w[1], attribyslope);
_mm_store_ss(&triangle->w[2], attriborigin);
+
+ clip0origin = 0;
+ clip0slope = 0;
+ clip0dir = 0;
+ if(thread->clipplane[0] || thread->clipplane[1] || thread->clipplane[2])
+ {
+ float cliporigin, clipxslope, clipyslope;
+ attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
+ attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
+ attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
+ attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
+ attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
+ attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
+ cliporigin = _mm_cvtss_f32(attriborigin)*thread->clipplane[2] + thread->clipplane[3];
+ clipxslope = thread->clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->clipplane[2];
+ clipyslope = thread->clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->clipplane[2];
+ if(clipxslope != 0)
+ {
+ clip0origin = -cliporigin/clipxslope;
+ clip0slope = -clipyslope/clipxslope;
+ clip0dir = clipxslope > 0 ? 1 : -1;
+ }
+ else if(clipyslope > 0)
+ {
+ clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
+ clip0slope = dpsoftrast.fb_width;
+ clip0dir = -1;
+ }
+ else if(clipyslope < 0)
+ {
+ clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
+ clip0slope = -dpsoftrast.fb_width;
+ clip0dir = -1;
+ }
+ // both slopes are zero here, so the clip distance is the same for every
+ // pixel of the triangle: skip the triangle when that distance is negative
+ // (clip0origin is still 0 on this path, so it must not be the value tested)
+ else if(cliporigin < 0) continue;
+ }
+
mipedgescale = _mm_setzero_ps();
for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
{
int yccmask = _mm_movemask_epi8(ycc);
int edge0p, edge0n, edge1p, edge1n;
int nexty;
+ float clip0;
if (numpoints == 4)
{
switch(yccmask)
xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
}
- for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
+ clip0 = clip0origin + (y+0.5f)*clip0slope;
+ for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
{
- int startx, endx, offset;
+ int startx, endx, clipx = minx, offset;
startx = _mm_cvtss_si32(xcoords);
endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
if (startx < minx)
}
if (endx > maxx) endx = maxx;
if (startx >= endx) continue;
+
+ if (clip0dir)
+ {
+ if (clip0dir > 0)
+ {
+ if (startx < clip0)
+ {
+ if(endx <= clip0) continue;
+ clipx = max((int)clip0, minx);
+ startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
+ }
+ }
+ else if (endx > clip0)
+ {
+ if(startx >= clip0) continue;
+ endx = (int)clip0;
+ }
+ }
+
for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
{
DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
span->triangle = thread->numtriangles;
span->x = offset;
span->y = y;
- span->startx = max(minx - offset, 0);
+ span->startx = max(clipx - offset, 0);
span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
if (span->startx >= span->endx)
continue;
INTERPCOMMAND(UniformMatrix4f)
INTERPCOMMAND(Uniform1i)
INTERPCOMMAND(SetRenderTargets)
+ INTERPCOMMAND(ClipPlane)
case DPSOFTRAST_OPCODE_Draw:
DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
thread->depthrange[1] = 1;
thread->polygonoffset[0] = 0;
thread->polygonoffset[1] = 0;
+ thread->clipplane[0] = 0;
+ thread->clipplane[1] = 0;
+ thread->clipplane[2] = 0;
+ thread->clipplane[3] = 1;
DPSOFTRAST_RecalcThread(thread);