de.git.xonotic.org Git - xonotic/darkplaces.git/blobdiff - dpsoftrast.c
reworked collision cache to only be used by bouncegrid and only in
[xonotic/darkplaces.git] / dpsoftrast.c
index 954bfcdf5c16e15f3fd8a8cda2cbebcdd56f0156..b3a00ae417f34004967976d6c7278332b0daa0bf 100644 (file)
@@ -15,21 +15,35 @@ typedef qboolean bool;
 #endif
 
 #define ALIGN_SIZE 16
-#define ATOMIC_SIZE 32
+#define ATOMIC_SIZE 4
 
 #ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
-               #define ATOMIC(var) var __attribute__((__aligned__(32)))
+               #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
+       #elif defined(__GNUC__) && defined(WIN32)
+               #define ALIGN(var) var __attribute__((__aligned__(16)))
+               #define ATOMIC(var) var __attribute__((__aligned__(4)))
+               #define MEMORY_BARRIER (_mm_sfence())
+               //(__sync_synchronize())
+               #define ATOMIC_COUNTER volatile LONG
+               // this LONG * cast serves to fix an issue with broken mingw
+               // packages on Ubuntu; these only declare the function to take
+               // a LONG *, causing a compile error here. This seems to be
+               // error- and warn-free on platforms that DO declare
+               // InterlockedIncrement correctly, like mingw on Windows.
+               #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
+               #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
+               #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
-               #define ATOMIC(var) var __attribute__((__aligned__(32)))
+               #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
@@ -38,7 +52,7 @@ typedef qboolean bool;
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
-               #define ATOMIC(var) __declspec(align(32)) var
+               #define ATOMIC(var) __declspec(align(4)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
@@ -73,15 +87,15 @@ typedef qboolean bool;
 #ifdef SSE_POSSIBLE
 #include <emmintrin.h>
 
-#if defined(__GNUC__) && (__GNUC < 4 || __GNUC_MINOR__ < 6)
+#if defined(__GNUC__) && (__GNUC < 4 || __GNUC_MINOR__ < 6) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
 #endif
 
-#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
+#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
 
 static void *MM_CALLOC(size_t nmemb, size_t size)
 {
-       void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
+       void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
        if (ptr != NULL) memset(ptr, 0, nmemb*size);
        return ptr;
 }
@@ -149,15 +163,15 @@ enum { DPSOFTRAST_OPCODE_Reset = 0 };
 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
 
-typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
+typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
 {
        int freecommand;
        int usedcommands;
-       ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
+       ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
 }
 DPSOFTRAST_State_Command_Pool);
 
-typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
+typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
 {
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
@@ -222,7 +236,7 @@ typedef enum DPSOFTRAST_BLENDMODE_e
 }
 DPSOFTRAST_BLENDMODE;
 
-typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
+typedef ALIGN(struct DPSOFTRAST_State_Thread_s
 {
        void *thread;
        int index;
@@ -235,9 +249,6 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
        int depthtest;
        int depthfunc;
        int scissortest;
-       int alphatest;
-       int alphafunc;
-       float alphavalue;
        int viewport[4];
        int scissor[4];
        float depthrange[2];
@@ -291,7 +302,7 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
 }
 DPSOFTRAST_State_Thread);
 
-typedef ATOMIC(struct DPSOFTRAST_State_s
+typedef ALIGN(struct DPSOFTRAST_State_s
 {
        int fb_width;
        int fb_height;
@@ -1154,30 +1165,6 @@ void DPSOFTRAST_CullFace(int mode)
        command->mode = mode;
 }
 
-DEFCOMMAND(15, AlphaTest, int enable;)
-static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
-{
-       thread->alphatest = command->enable;
-}
-void DPSOFTRAST_AlphaTest(int enable)
-{
-       DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
-       command->enable = enable;
-}
-
-DEFCOMMAND(16, AlphaFunc, int func; float ref;)
-static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
-{
-       thread->alphafunc = command->func;
-       thread->alphavalue = command->ref;
-}
-void DPSOFTRAST_AlphaFunc(int func, float ref)
-{
-       DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
-       command->func = func;
-       command->ref = ref;
-}
-
 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
 {
        dpsoftrast.color[0] = r;
@@ -2062,7 +2049,7 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
        pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
        pixeli += span->y * dpsoftrast.fb_width + span->x;
        // handle alphatest now (this affects depth writes too)
-       if (thread->alphatest)
+       if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
                for (x = startx;x < endx;x++)
                        if (in4ub[x*4+3] < 128)
                                pixelmask[x] = false;
@@ -2297,166 +2284,63 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
 #endif
 }
 
-static void DPSOFTRAST_Texture2D(DPSOFTRAST_Texture *texture, int mip, float x, float y, float c[4])
-       // warning: this is SLOW, only use if the optimized per-span functions won't do
-       // FIXME does this function need flipping of the color order?
-{
-       const unsigned char * RESTRICT pixelbase;
-       const unsigned char * RESTRICT pixel[4];
-       int tciwrapmask[2];
-       tciwrapmask[0] = texture->mipmap[mip][2]-1;
-       tciwrapmask[1] = texture->mipmap[mip][3]-1;
-       pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
-       if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
-       {
-               if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
-               {
-                       unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
-                       unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
-                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
-                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                       int tci[2] = { tc[0]>>12, tc[1]>>12 };
-                       int tci1[2] = { tci[0] + 1, tci[1] + 1 };
-                       tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
-                       tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
-                       tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[mip][2]-1 ? tci1[0] : texture->mipmap[mip][2]-1) : 0;
-                       tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[mip][3]-1 ? tci1[1] : texture->mipmap[mip][3]-1) : 0;
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
-                       pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
-                       c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF00000);
-                       c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF00000);
-                       c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF00000);
-                       c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF00000);
-               }
-               else
-               {
-                       unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
-                       unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
-                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
-                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                       int tci[2] = { tc[0]>>12, tc[1]>>12 };
-                       int tci1[2] = { tci[0] + 1, tci[1] + 1 };
-                       tci[0] &= tciwrapmask[0];
-                       tci[1] &= tciwrapmask[1];
-                       tci1[0] &= tciwrapmask[0];
-                       tci1[1] &= tciwrapmask[1];
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
-                       pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
-                       c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF00000);
-                       c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF00000);
-                       c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF00000);
-                       c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF00000);
-               }
-       }
-       else
-       {
-               if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
-               {
-                       int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
-                       tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
-                       tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       c[0] = pixel[0][0] * (1.0f / 255.0f);
-                       c[1] = pixel[0][1] * (1.0f / 255.0f);
-                       c[2] = pixel[0][2] * (1.0f / 255.0f);
-                       c[3] = pixel[0][3] * (1.0f / 255.0f);
-               }
-               else
-               {
-                       int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
-                       tci[0] &= tciwrapmask[0];
-                       tci[1] &= tciwrapmask[1];
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       c[0] = pixel[0][0] * (1.0f / 255.0f);
-                       c[1] = pixel[0][1] * (1.0f / 255.0f);
-                       c[2] = pixel[0][2] * (1.0f / 255.0f);
-                       c[3] = pixel[0][3] * (1.0f / 255.0f);
-               }
-       }
-}
-
 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
        // warning: this is SLOW, only use if the optimized per-span functions won't do
 {
        const unsigned char * RESTRICT pixelbase;
        const unsigned char * RESTRICT pixel[4];
-       int tciwrapmask[2];
-       tciwrapmask[0] = texture->mipmap[mip][2]-1;
-       tciwrapmask[1] = texture->mipmap[mip][3]-1;
+       int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
+       int wrapmask[2] = { width-1, height-1 };
        pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
        if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
        {
+               unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
+               unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
+               unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
+               unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
+               int tci[2] = { tc[0]>>12, tc[1]>>12 };
+               int tci1[2] = { tci[0] + 1, tci[1] + 1 };
                if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
                {
-                       unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
-                       unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
-                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
-                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                       int tci[2] = { tc[0]>>12, tc[1]>>12 };
-                       int tci1[2] = { tci[0] + 1, tci[1] + 1 };
-                       tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
-                       tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
-                       tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[mip][2]-1 ? tci1[0] : texture->mipmap[mip][2]-1) : 0;
-                       tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[mip][3]-1 ? tci1[1] : texture->mipmap[mip][3]-1) : 0;
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
-                       pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
-                       c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
-                       c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
-                       c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
-                       c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
+                       tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
+                       tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
+                       tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
+                       tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
                }
                else
                {
-                       unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
-                       unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
-                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
-                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
-                       int tci[2] = { tc[0]>>12, tc[1]>>12 };
-                       int tci1[2] = { tci[0] + 1, tci[1] + 1 };
-                       tci[0] &= tciwrapmask[0];
-                       tci[1] &= tciwrapmask[1];
-                       tci1[0] &= tciwrapmask[0];
-                       tci1[1] &= tciwrapmask[1];
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
-                       pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
-                       pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
-                       c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
-                       c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
-                       c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
-                       c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
+                       tci[0] &= wrapmask[0];
+                       tci[1] &= wrapmask[1];
+                       tci1[0] &= wrapmask[0];
+                       tci1[1] &= wrapmask[1];
                }
+               pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
+               pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
+               pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
+               pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
+               c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
+               c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
+               c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
+               c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
        }
        else
        {
+               int tci[2] = { x * width, y * height };
                if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
                {
-                       int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
-                       tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
-                       tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       c[0] = pixel[0][0];
-                       c[1] = pixel[0][1];
-                       c[2] = pixel[0][2];
-                       c[3] = pixel[0][3];
+                       tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
+                       tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
                }
                else
                {
-                       int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
-                       tci[0] &= tciwrapmask[0];
-                       tci[1] &= tciwrapmask[1];
-                       pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
-                       c[0] = pixel[0][0];
-                       c[1] = pixel[0][1];
-                       c[2] = pixel[0][2];
-                       c[3] = pixel[0][3];
+                       tci[0] &= wrapmask[0];
+                       tci[1] &= wrapmask[1];
                }
+               pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
+               c[0] = pixel[0][0];
+               c[1] = pixel[0][1];
+               c[2] = pixel[0][2];
+               c[3] = pixel[0][3];
        }
 }
 
@@ -3422,7 +3306,7 @@ void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPS
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
-       if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+       if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
@@ -3475,7 +3359,7 @@ void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const D
        int arrayindex = DPSOFTRAST_ARRAY_COLOR;
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
-       if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+       if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
@@ -3550,7 +3434,7 @@ void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSO
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
-       if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+       if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
@@ -3973,8 +3857,8 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
 
                                specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
                        }
+                       specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
 
-                       specular = pow(specular, SpecularPower * glosstex[3]);
                        if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
                        {
                                d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
@@ -4382,7 +4266,7 @@ void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const D
 
                                specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
                        }
-                       specular = pow(specular, SpecularPower * glosstex[3]);
+                       specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
 
                        if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
                        {
@@ -4682,9 +4566,9 @@ void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTR
        float EyeVectorslope[4];
 
        // uniforms
-       float ScreenScaleRefractReflect[2];
-       float ScreenCenterRefractReflect[2];
-       float DistortScaleRefractReflect[2];
+       float ScreenScaleRefractReflect[4];
+       float ScreenCenterRefractReflect[4];
+       float DistortScaleRefractReflect[4];
        float RefractColor[4];
        float ReflectColor[4];
        float ReflectFactor;
@@ -4855,6 +4739,8 @@ static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COU
        {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
        {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
        {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
+       {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
+       {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
        {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
        {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
        {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
@@ -5575,8 +5461,6 @@ static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, i
                INTERPCOMMAND(DepthRange)
                INTERPCOMMAND(PolygonOffset)
                INTERPCOMMAND(CullFace)
-               INTERPCOMMAND(AlphaTest)
-               INTERPCOMMAND(AlphaFunc)
                INTERPCOMMAND(SetTexture)
                INTERPCOMMAND(SetShader)
                INTERPCOMMAND(Uniform4f)
@@ -5732,9 +5616,6 @@ int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsign
                thread->depthtest = true;
                thread->depthfunc = GL_LEQUAL;
                thread->scissortest = false;
-               thread->alphatest = false;
-               thread->alphafunc = GL_GREATER;
-               thread->alphavalue = 0.5f;
                thread->viewport[0] = 0;
                thread->viewport[1] = 0;
                thread->viewport[2] = dpsoftrast.fb_width;