de.git.xonotic.org Git - xonotic/darkplaces.git/blobdiff - dpsoftrast.c
added simple affine check to accelerate texture fetches on 2D art
[xonotic/darkplaces.git] / dpsoftrast.c
index d733c9a511dafb013b3b49f53e0d04297444409c..b0a5a3ad50f9651fa3a5849c6a634f053a3b76de 100644 (file)
@@ -235,6 +235,7 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];
+       ALIGN(float clipplane[4]);
 
        int shader_mode;
        int shader_permutation;
@@ -1435,6 +1436,25 @@ void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
        dpsoftrast.uniform1i[command->index] = i0;
 }
 
+DEFCOMMAND(24, ClipPlane, float clipplane[4];) // presumably declares DPSOFTRAST_Command_ClipPlane (id 24) with a 4-float payload; DEFCOMMAND is defined elsewhere -- confirm
+static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command) // applies a ClipPlane command to the given thread state
+{
+       memcpy(thread->clipplane, command->clipplane, 4*sizeof(float)); // copy all four plane coefficients from the command into thread->clipplane
+}
+void DPSOFTRAST_ClipPlane(float x, float y, float z, float w) // fills a newly allocated ClipPlane command with the plane (x, y, z, w) after adjusting it by the framebuffer viewport scale/center
+{
+       DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane); // command storage comes from DPSOFTRAST_ALLOCATECOMMAND (defined elsewhere)
+       x /= dpsoftrast.fb_viewportscale[1]; // x pairs with index 1; presumably the viewport arrays are laid out as (w, x, y, z) -- confirm
+       y /= dpsoftrast.fb_viewportscale[2]; // y pairs with index 2
+       z /= dpsoftrast.fb_viewportscale[3]; // z pairs with index 3
+       w /= dpsoftrast.fb_viewportscale[0]; // w pairs with index 0
+       w -= dpsoftrast.fb_viewportcenter[1]*x + dpsoftrast.fb_viewportcenter[2]*y + dpsoftrast.fb_viewportcenter[3]*z + dpsoftrast.fb_viewportcenter[0]*w; // subtracts the dot product of fb_viewportcenter with the already-scaled x, y, z, w; NOTE(review): the right-hand side reads the scaled w -- confirm intended
+       command->clipplane[0] = x; // store the adjusted coefficients in the command payload
+       command->clipplane[1] = y;
+       command->clipplane[2] = z;
+       command->clipplane[3] = w;
+}
+
 #ifdef SSE_POSSIBLE
 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
 {
@@ -1989,6 +2009,13 @@ void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAS
        float wslope = triangle->w[0];
        float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
        float endz = 1.0f / (w + wslope * startx);
+       if (triangle->w[0] == 0)
+       {
+               // LordHavoc: fast flat polygons (HUD/menu)
+               for (x = startx;x < endx;x++)
+                       zf[x] = endz;
+               return;
+       }
        for (x = startx;x < endx;)
        {
                int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
@@ -2186,6 +2213,7 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
        int x;
        int startx = span->startx;
        int endx = span->endx;
+       int subx;
        const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
@@ -2197,127 +2225,193 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
        // handle alphatest now (this affects depth writes too)
        if (thread->alphatest)
                for (x = startx;x < endx;x++)
-                       if (in4ub[x*4+3] < 0.5f)
+                       if (in4ub[x*4+3] < 128)
                                pixelmask[x] = false;
-       // FIXME: this does not handle bigendian
+       // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
+       // helps sprites, text and hud artwork
        switch(thread->fb_blendmode)
        {
+       case DPSOFTRAST_BLENDMODE_ALPHA:
+       case DPSOFTRAST_BLENDMODE_ADDALPHA:
+       case DPSOFTRAST_BLENDMODE_SUBALPHA:
+               for (x = startx;x < endx;x++)
+                       if (in4ub[x*4+3] < 1)
+                               pixelmask[x] = false;
+               break;
        case DPSOFTRAST_BLENDMODE_OPAQUE:
-               for (x = startx;x + 4 <= endx;)
+       case DPSOFTRAST_BLENDMODE_ADD:
+       case DPSOFTRAST_BLENDMODE_INVMOD:
+       case DPSOFTRAST_BLENDMODE_MUL:
+       case DPSOFTRAST_BLENDMODE_MUL2:
+       case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
+       case DPSOFTRAST_BLENDMODE_INVADD:
+               break;
+       }
+       // put some special values at the end of the mask to ensure the loops end
+       pixelmask[endx] = 1;
+       pixelmask[endx+1] = 0;
+       // LordHavoc: use a double loop to identify subspans, this helps the
+       // optimized copy/blend loops to perform at their best, most triangles
+       // have only one run of pixels, and do the search using wide reads...
+       x = startx;
+       while (x < endx)
+       {
+               // if this pixel is masked off, it's probably not alone...
+               if (!pixelmask[x])
                {
-                       if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
+                       x++;
+#if 1
+                       if (x + 8 < endx)
                        {
-                               _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
-                               x += 4;
+                               // the 4-item search must be aligned or else it stalls badly
+                               if ((x & 3) && !pixelmask[x]) x++;
+                               if ((x & 3) && !pixelmask[x]) x++;
+                               if ((x & 3) && !pixelmask[x]) x++;
+                               while (*((unsigned int *)pixelmask + x) == 0x00000000)
+                                       x += 4;
+                       }
+#endif
+                       for (;!pixelmask[x];x++)
+                               ;
+                       // rather than continue the loop, just check the end variable
+                       if (x >= endx)
+                               break;
+               }
+               // find length of subspan
+               subx = x + 1;
+#if 1
+               if (x + 8 < endx)
+               {
+                       if ((subx & 3) && pixelmask[subx]) subx++;
+                       if ((subx & 3) && pixelmask[subx]) subx++;
+                       if ((subx & 3) && pixelmask[subx]) subx++;
+                       while (*((unsigned int *)pixelmask + subx) == 0x01010101)
+                               subx += 4;
+               }
+#endif
+               for (;pixelmask[subx];subx++)
+                       ;
+               // the checks can overshoot, so make sure to clip it...
+               if (subx > endx)
+                       subx = endx;
+               // now that we know the subspan length...  process!
+               switch(thread->fb_blendmode)
+               {
+               case DPSOFTRAST_BLENDMODE_OPAQUE:
+#if 0
+                       if (subx - x >= 16)
+                       {
+                               memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
+                               x = subx;
                        }
                        else
+#elif 1
+                       while (x + 16 <= subx)
+                       {
+                               _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+                               _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
+                               _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
+                               _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
+                               x += 16;
+                       }
+#endif
                        {
-                               if (pixelmask[x])
+                               while (x + 4 <= subx)
+                               {
+                                       _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+                                       x += 4;
+                               }
+                               if (x + 2 <= subx)
+                               {
                                        pixeli[x] = ini[x];
-                               x++;
+                                       pixeli[x+1] = ini[x+1];
+                                       x += 2;
+                               }
+                               if (x < subx)
+                               {
+                                       pixeli[x] = ini[x];
+                                       x++;
+                               }
                        }
-               }
-               for (;x < endx;x++)
-                       if (pixelmask[x])
-                               pixeli[x] = ini[x];
-               break;
-       case DPSOFTRAST_BLENDMODE_ALPHA:
-       #define FINISHBLEND(blend2, blend1) \
-               for (x = startx;x + 1 < endx;x += 2) \
-               { \
-                       __m128i src, dst; \
-                       switch (*(const unsigned short*)&pixelmask[x]) \
+                       break;
+               case DPSOFTRAST_BLENDMODE_ALPHA:
+               #define FINISHBLEND(blend2, blend1) \
+                       for (;x + 1 < subx;x += 2) \
                        { \
-                       case 0x0101: \
+                               __m128i src, dst; \
                                src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
                                dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
                                blend2; \
                                _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
-                               continue; \
-                       case 0x0100: \
-                               src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
-                               dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
-                               blend1; \
-                               pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
-                               continue; \
-                       case 0x0001: \
+                       } \
+                       if (x < subx) \
+                       { \
+                               __m128i src, dst; \
                                src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
                                dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
                                blend1; \
                                pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
-                               continue; \
-                       } \
-                       break; \
-               } \
-               for(;x < endx; x++) \
-               { \
-                       __m128i src, dst; \
-                       if (!pixelmask[x]) \
-                               continue; \
-                       src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
-                       dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
-                       blend1; \
-                       pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
+                               x++; \
+                       }
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_ADDALPHA:
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_ADD:
+                       FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
+                       break;
+               case DPSOFTRAST_BLENDMODE_INVMOD:
+                       FINISHBLEND({
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+                       }, {
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_MUL:
+                       FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
+                       break;
+               case DPSOFTRAST_BLENDMODE_MUL2:
+                       FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
+                       break;
+               case DPSOFTRAST_BLENDMODE_SUBALPHA:
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_INVADD:
+                       FINISHBLEND({
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+                       }, {
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+                       });
+                       break;
                }
-
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_ADDALPHA:
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_ADD:
-               FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
-               break;
-       case DPSOFTRAST_BLENDMODE_INVMOD:
-               FINISHBLEND({
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
-               }, {
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_MUL:
-               FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
-               break;
-       case DPSOFTRAST_BLENDMODE_MUL2:
-               FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
-               break;
-       case DPSOFTRAST_BLENDMODE_SUBALPHA:
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_INVADD:
-               FINISHBLEND({
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
-               }, {
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
-               });
-               break;
        }
 #endif
 }
@@ -2386,27 +2480,37 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
        tcimax[1] = texture->mipmap[mip][3]-1;
        tciwrapmask[0] = texture->mipmap[mip][2]-1;
        tciwrapmask[1] = texture->mipmap[mip][3]-1;
-       endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
-       endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
+       endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
+       endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
+       if (filter)
+       {
+               endtc[0] -= 0.5f;
+               endtc[1] -= 0.5f;
+       }
        for (x = startx;x < endx;)
        {
                unsigned int subtc[2];
                unsigned int substep[2];
-               float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
+               float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
                int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                if (nextsub >= endx)
                {
                        nextsub = endsub = endx-1;      
-                       if (x < nextsub) subscale = 65536.0f / (nextsub - x);
+                       if (x < nextsub) subscale = 4096.0f / (nextsub - x);
                }
                tc[0] = endtc[0];
                tc[1] = endtc[1];
-               endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
-               endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
+               endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
+               endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
+               if (filter)
+               {
+                       endtc[0] -= 0.5f;
+                       endtc[1] -= 0.5f;
+               }
                substep[0] = (endtc[0] - tc[0]) * subscale;
                substep[1] = (endtc[1] - tc[1]) * subscale;
-               subtc[0] = tc[0] * (1<<16);
-               subtc[1] = tc[1] * (1<<16);
+               subtc[0] = tc[0] * (1<<12);
+               subtc[1] = tc[1] * (1<<12);
                if (filter)
                {
                        if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
@@ -2416,8 +2520,8 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                                        unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
                                        unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                                        unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                                       tci[0] = subtc[0]>>16;
-                                       tci[1] = subtc[1]>>16;
+                                       tci[0] = subtc[0]>>12;
+                                       tci[1] = subtc[1]>>12;
                                        tci1[0] = tci[0] + 1;
                                        tci1[1] = tci[1] + 1;
                                        tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
@@ -2445,8 +2549,8 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                                        unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
                                        unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                                        unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                                       tci[0] = subtc[0]>>16;
-                                       tci[1] = subtc[1]>>16;
+                                       tci[0] = subtc[0]>>12;
+                                       tci[1] = subtc[1]>>12;
                                        tci1[0] = tci[0] + 1;
                                        tci1[1] = tci[1] + 1;
                                        tci[0] &= tciwrapmask[0];
@@ -2472,8 +2576,8 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                {
                        for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
                        {
-                               tci[0] = subtc[0]>>16;
-                               tci[1] = subtc[1]>>16;
+                               tci[0] = subtc[0]>>12;
+                               tci[1] = subtc[1]>>12;
                                tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
                                tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
                                pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
@@ -2491,8 +2595,8 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                {
                        for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
                        {
-                               tci[0] = subtc[0]>>16;
-                               tci[1] = subtc[1]>>16;
+                               tci[0] = subtc[0]>>12;
+                               tci[1] = subtc[1]>>12;
                                tci[0] &= tciwrapmask[0];
                                tci[1] &= tciwrapmask[1];
                                pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
@@ -2522,6 +2626,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        __m128i subtc, substep, endsubtc;
        int filter;
        int mip;
+       int affine; // LordHavoc: optimized affine texturing case
        unsigned int * RESTRICT outi = (unsigned int *)out4ub;
        const unsigned char * RESTRICT pixelbase;
        DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
@@ -2541,6 +2646,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
                        outi[x] = k;
                return;
        }
+       affine = zf[startx] == zf[endx-1];
        filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
        DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
        flags = texture->flags;
@@ -2549,7 +2655,9 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        tcscale = _mm_cvtepi32_ps(tcsize);
        data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
        slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
-       endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
+       endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+       if (filter)
+               endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
        endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
        tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
        tcmax = _mm_packs_epi32(tcmask, tcmask);
@@ -2557,14 +2665,16 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        {
                int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
-               if (nextsub >= endx)
+               if (nextsub >= endx || affine)
                {
                        nextsub = endsub = endx-1;
                        if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
                }       
                tc = endtc;
                subtc = endsubtc;
-               endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
+               endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+               if (filter)
+                       endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
                substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
                endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
                subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
@@ -4389,7 +4499,6 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
 
        // texture reads
        unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
-       //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
 
        // varyings
@@ -4411,7 +4520,6 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
        // read textures
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
-       //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
 
        // read varyings
        DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
@@ -4441,7 +4549,7 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
 
                // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
                iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
-        
+               
                // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
                SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
                SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
@@ -4457,11 +4565,11 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
                // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
                if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
                {
-                       unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
+                       unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
                        unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
                        unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                        unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                       int tci[2] = { tc[0]>>16, tc[1]>>16 };
+                       int tci[2] = { tc[0]>>12, tc[1]>>12 };
                        int tci1[2] = { tci[0] + 1, tci[1] + 1 };
                        tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
                        tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
@@ -4477,12 +4585,9 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
                }
                else
                {
-                       int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
-                       int tci1[2] = { tci[0] + 1, tci[1] + 1 };
+                       int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
                        tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
                        tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
-                       tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
-                       tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
                        pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
                        c[0] = pixel[0][0];
                        c[1] = pixel[0][1];
@@ -4616,7 +4721,7 @@ void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
        unsigned int d;
        DPSOFTRAST_State_Triangle *triangle;
        DPSOFTRAST_State_Span *span;
-       unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
+       unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
        for (i = 0; i < thread->numspans; i++)
        {
                span = &thread->spans[i];
@@ -4705,6 +4810,8 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
        int numpoints;
        int clipcase;
        float clipdist[4];
+       float clip0origin, clip0slope;
+       int clip0dir;
        __m128 triangleedge1, triangleedge2, trianglenormal;
        __m128 clipfrac[3];
        __m128 screen[4];
@@ -4929,6 +5036,43 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                        _mm_store_ss(&triangle->w[0], attribxslope);
                        _mm_store_ss(&triangle->w[1], attribyslope);
                        _mm_store_ss(&triangle->w[2], attriborigin);
+                       
+                       clip0origin = 0;
+                       clip0slope = 0;
+                       clip0dir = 0;
+                       if(thread->clipplane[0] || thread->clipplane[1] || thread->clipplane[2])
+                       {
+                               float cliporigin, clipxslope, clipyslope;
+                               attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
+                               attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
+                               attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
+                               attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
+                               attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
+                               attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
+                               cliporigin = _mm_cvtss_f32(attriborigin)*thread->clipplane[2] + thread->clipplane[3];
+                               clipxslope = thread->clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->clipplane[2];
+                               clipyslope = thread->clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->clipplane[2];
+                               if(clipxslope != 0)
+                               {
+                                       clip0origin = -cliporigin/clipxslope;
+                                       clip0slope = -clipyslope/clipxslope;
+                                       clip0dir = clipxslope > 0 ? 1 : -1;
+                               }
+                               else if(clipyslope > 0)
+                               {
+                                       clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
+                                       clip0slope = dpsoftrast.fb_width;
+                                       clip0dir = -1;
+                               }
+                               else if(clipyslope < 0)
+                               {
+                                       clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
+                                       clip0slope = -dpsoftrast.fb_width;
+                                       clip0dir = -1;
+                               }
+                               else if(clip0origin < 0) continue;
+                       }
+
                        mipedgescale = _mm_setzero_ps();
                        for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
                        {
@@ -4990,6 +5134,7 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                        int yccmask = _mm_movemask_epi8(ycc);
                        int edge0p, edge0n, edge1p, edge1n;
                        int nexty;
+                       float clip0;
                        if (numpoints == 4)
                        {
                                switch(yccmask)
@@ -5043,9 +5188,10 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                                xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
                                xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
                        }
-                       for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
+                       clip0 = clip0origin + (y+0.5f)*clip0slope;
+                       for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
                        {
-                               int startx, endx, offset;
+                               int startx, endx, clipx = minx, offset;
                                startx = _mm_cvtss_si32(xcoords);
                                endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                if (startx < minx) 
@@ -5055,13 +5201,32 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                                }
                                if (endx > maxx) endx = maxx;
                                if (startx >= endx) continue;
+
+                               if (clip0dir)
+                               {
+                                       if (clip0dir > 0)
+                                       {
+                                               if (startx < clip0) 
+                                               {
+                                                       if(endx <= clip0) continue;
+                                                       clipx = max((int)clip0, minx);
+                                                       startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1); 
+                                               }
+                                       }
+                                       else if (endx > clip0) 
+                                       {
+                                               if(startx >= clip0) continue;
+                                               endx = (int)clip0;
+                                       }
+                               }
+                                               
                                for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
                                {
                                        DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
                                        span->triangle = thread->numtriangles;
                                        span->x = offset;
                                        span->y = y;
-                                       span->startx = max(minx - offset, 0);
+                                       span->startx = max(clipx - offset, 0);
                                        span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                        if (span->startx >= span->endx)
                                                continue; 
@@ -5251,6 +5416,7 @@ static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, i
                INTERPCOMMAND(UniformMatrix4f)
                INTERPCOMMAND(Uniform1i)
                INTERPCOMMAND(SetRenderTargets)
+               INTERPCOMMAND(ClipPlane)
 
                case DPSOFTRAST_OPCODE_Draw:
                        DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
@@ -5413,6 +5579,10 @@ int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsign
                thread->depthrange[1] = 1;
                thread->polygonoffset[0] = 0;
                thread->polygonoffset[1] = 0;
+               thread->clipplane[0] = 0;
+               thread->clipplane[1] = 0;
+               thread->clipplane[2] = 0;
+               thread->clipplane[3] = 1;
        
                DPSOFTRAST_RecalcThread(thread);