de.git.xonotic.org Git - xonotic/darkplaces.git/commitdiff
initial obfuscation of ProcessTriangles with SSE2 code for testing
author eihrul <eihrul@d7cf8633-e32d-0410-b094-e92efae38249>
Wed, 26 Jan 2011 13:00:02 +0000 (13:00 +0000)
committer eihrul <eihrul@d7cf8633-e32d-0410-b094-e92efae38249>
Wed, 26 Jan 2011 13:00:02 +0000 (13:00 +0000)
git-svn-id: svn://svn.icculus.org/twilight/trunk/darkplaces@10752 d7cf8633-e32d-0410-b094-e92efae38249

dpsoftrast.c

index 884d454c1c2f1ef02688aac37190d60c5faed628..9f663e13ebb3e01d7ad991a8c1694da439d2bf09 100644 (file)
@@ -9,45 +9,32 @@
 typedef qboolean bool;
 #endif
 
-#define GL_NONE                                        0
-#define GL_FRONT_LEFT                  0x0400
-#define GL_FRONT_RIGHT                 0x0401
-#define GL_BACK_LEFT                   0x0402
-#define GL_BACK_RIGHT                  0x0403
-#define GL_FRONT                               0x0404
-#define GL_BACK                                        0x0405
-#define GL_LEFT                                        0x0406
-#define GL_RIGHT                               0x0407
-#define GL_FRONT_AND_BACK              0x0408
-#define GL_AUX0                                        0x0409
-#define GL_AUX1                                        0x040A
-#define GL_AUX2                                        0x040B
-#define GL_AUX3                                        0x040C
-
-#define GL_NEVER                               0x0200
-#define GL_LESS                                        0x0201
-#define GL_EQUAL                               0x0202
-#define GL_LEQUAL                              0x0203
-#define GL_GREATER                             0x0204
-#define GL_NOTEQUAL                            0x0205
-#define GL_GEQUAL                              0x0206
-#define GL_ALWAYS                              0x0207
-
-#define GL_ZERO                                        0x0
-#define GL_ONE                                 0x1
-#define GL_SRC_COLOR                           0x0300
-#define GL_ONE_MINUS_SRC_COLOR                 0x0301
-#define GL_DST_COLOR                           0x0306
-#define GL_ONE_MINUS_DST_COLOR                 0x0307
-#define GL_SRC_ALPHA                           0x0302
-#define GL_ONE_MINUS_SRC_ALPHA                 0x0303
-#define GL_DST_ALPHA                           0x0304
-#define GL_ONE_MINUS_DST_ALPHA                 0x0305
-#define GL_SRC_ALPHA_SATURATE                  0x0308
-#define GL_CONSTANT_COLOR                      0x8001
-#define GL_ONE_MINUS_CONSTANT_COLOR            0x8002
-#define GL_CONSTANT_ALPHA                      0x8003
-#define GL_ONE_MINUS_CONSTANT_ALPHA            0x8004
+#if defined(__GNUC__)
+#define ALIGN(var) var __attribute__((__aligned__(16)))
+#elif defined(_MSC_VER)
+#define ALIGN(var) __declspec(align(16)) var
+#else
+#define ALIGN(var) var
+#endif
+
+#ifdef SSE2_PRESENT
+#include <emmintrin.h>
+
+#define MM_MALLOC(size) _mm_malloc(size, 16)
+
+static void *MM_CALLOC(size_t nmemb, size_t size)
+{
+       void *ptr = _mm_malloc(nmemb*size, 16);
+       if(ptr != NULL) memset(ptr, 0, nmemb*size);
+       return ptr;
+}
+
+#define MM_FREE _mm_free
+#else
+#define MM_MALLOC(size) malloc(size)
+#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
+#define MM_FREE free
+#endif
 
 typedef enum DPSOFTRAST_ARRAY_e
 {
@@ -103,7 +90,7 @@ DPSOFTRAST_State_User;
 
 #define DPSOFTRAST_MAXSUBSPAN 16
 
-typedef struct DPSOFTRAST_State_Draw_Span_s
+typedef ALIGN(struct DPSOFTRAST_State_Draw_Span_s
 {
        int start; // pixel index
        int length; // pixel count
@@ -116,9 +103,9 @@ typedef struct DPSOFTRAST_State_Draw_Span_s
        // [0][DPSOFTRAST_ARRAY_TOTAL][] is start screencoord4f
        // [1][DPSOFTRAST_ARRAY_TOTAL][] is end screencoord4f
        // NOTE: screencoord4f[3] is W (basically 1/Z), useful for depthbuffer
-       float data[2][DPSOFTRAST_ARRAY_TOTAL+1][4];
+       ALIGN(float data[2][DPSOFTRAST_ARRAY_TOTAL+1][4]);
 }
-DPSOFTRAST_State_Draw_Span;
+DPSOFTRAST_State_Draw_Span);
 
 #define DPSOFTRAST_DRAW_MAXSPANQUEUE 1024
 
@@ -453,7 +440,7 @@ int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
        texture->size = size;
 
        // allocate the pixels now
-       texture->bytes = (unsigned char *)calloc(1, size);
+       texture->bytes = (unsigned char *)MM_CALLOC(1, size);
 
        return texnum;
 }
@@ -462,7 +449,7 @@ void DPSOFTRAST_Texture_Free(int index)
        DPSOFTRAST_Texture *texture;
        texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
        if (texture->bytes)
-               free(texture->bytes);
+               MM_FREE(texture->bytes);
        texture->bytes = NULL;
        memset(texture, 0, sizeof(*texture));
        // adjust the free range and used range
@@ -979,8 +966,8 @@ void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcol
                while (dpsoftrast.draw.maxvertices < dpsoftrast.draw.numvertices)
                        dpsoftrast.draw.maxvertices *= 2;
                if (dpsoftrast.draw.in_array4f[0])
-                       free(dpsoftrast.draw.in_array4f[0]);
-               data = (float *)calloc(1, dpsoftrast.draw.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
+                       MM_FREE(dpsoftrast.draw.in_array4f[0]);
+               data = (float *)MM_CALLOC(1, dpsoftrast.draw.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
                for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.draw.maxvertices * 4)
                        dpsoftrast.draw.in_array4f[i] = data;
                for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.draw.maxvertices * 4)
@@ -3498,6 +3485,7 @@ void DPSOFTRAST_Draw_ProcessSpans(void)
 
 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask)
 {
+#ifdef SSE2_PRESENT
        int cullface = dpsoftrast.user.cullface;
        int width = dpsoftrast.fb_width;
        int height = dpsoftrast.fb_height;
@@ -3506,8 +3494,8 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
        int k;
        int y;
        int e[3];
-       int screenx[4];
-       int screeny[4];
+       ALIGN(int screeny[4]);
+    int starty, endy;
        int screenyless[4];
        int numpoints;
        int clipflags;
@@ -3515,7 +3503,6 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
        int edge0n;
        int edge1p;
        int edge1n;
-       int extent[6];
        int startx;
        int endx;
        float mip_edge0tc[2];
@@ -3539,16 +3526,15 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
        float spanilength;
        float startxlerp;
        float yc;
-       float w;
        float frac;
        float ifrac;
-       float trianglearea2;
-       float triangleedge[2][4];
-       float trianglenormal[4];
-       float clipdist[4];
-       float clipped[DPSOFTRAST_ARRAY_TOTAL][4][4];
-       float screen[4][4];
-       float proj[DPSOFTRAST_ARRAY_TOTAL][4][4];
+       //float trianglearea2;
+       __m128 triangleedge[2];
+       __m128 trianglenormal;
+       ALIGN(float clipdist[4]);
+       ALIGN(float clipped[DPSOFTRAST_ARRAY_TOTAL][4][4]);
+       ALIGN(float screen[4][4]);
+       __m128 proj[DPSOFTRAST_ARRAY_TOTAL][4];
        DPSOFTRAST_Texture *texture;
        DPSOFTRAST_State_Draw_Span *span;
        DPSOFTRAST_State_Draw_Span *oldspan;
@@ -3574,31 +3560,38 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                        e[1] = i*3+1;
                        e[2] = i*3+2;
                }
-               triangleedge[0][0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+0] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+0];
-               triangleedge[0][1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+1] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+1];
-               triangleedge[0][2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2];
-               triangleedge[1][0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+0] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+0];
-               triangleedge[1][1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+1] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+1];
-               triangleedge[1][2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2];
+               {
+                       __m128 v0 = _mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4]);
+                       triangleedge[0] = _mm_sub_ps(_mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4]), v0);
+                       triangleedge[1] = _mm_sub_ps(_mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4]), v0);
+               }
+               // store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar
+               trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
+                                                                       _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
+#if 0
+               trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
                trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
                trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
-               trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
-               trianglearea2 = trianglenormal[0] * trianglenormal[0] + trianglenormal[1] * trianglenormal[1] + trianglenormal[2] * trianglenormal[2];
-               // skip degenerate triangles, nothing good can come from them...
-               if (trianglearea2 == 0.0f)
-                       continue;
+#endif
                // apply current cullface mode (this culls many triangles)
                switch(cullface)
                {
                case GL_BACK:
-                       if (trianglenormal[2] < 0)
+                       if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps()))
                                continue;
                        break;
                case GL_FRONT:
-                       if (trianglenormal[2] > 0)
+                       if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps()))
                                continue;
                        break;
                }
+#if 0
+               trianglearea2 = trianglenormal[0] * trianglenormal[0] + trianglenormal[1] * trianglenormal[1] + trianglenormal[2] * trianglenormal[2];
+               // skip degenerate triangles, nothing good can come from them...
+               if (trianglearea2 == 0.0f)
+                       continue;
+#endif
+               
                // calculate distance from nearplane
                clipdist[0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + 1.0f;
                clipdist[1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + 1.0f;
@@ -3695,42 +3688,30 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                        // triangle is entirely behind nearplane
                        continue;
                }
-               // calculate integer y coords for triangle points
-               screenx[0] = (int)(screen[0][0]);
-               screeny[0] = (int)(screen[0][1]);
-               screenx[1] = (int)(screen[1][0]);
-               screeny[1] = (int)(screen[1][1]);
-               screenx[2] = (int)(screen[2][0]);
-               screeny[2] = (int)(screen[2][1]);
-               screenx[3] = (int)(screen[3][0]);
-               screeny[3] = (int)(screen[3][1]);
-               // figure out the extents (bounding box) of the triangle
-               extent[0] = screenx[0];
-               extent[1] = screeny[0];
-               extent[2] = screenx[0];
-               extent[3] = screeny[0];
-               for (j = 1;j < numpoints;j++)
                {
-                       if (extent[0] > screenx[j]) extent[0] = screenx[j];
-                       if (extent[1] > screeny[j]) extent[1] = screeny[j];
-                       if (extent[2] < screenx[j]) extent[2] = screenx[j];
-                       if (extent[3] < screeny[j]) extent[3] = screeny[j];
+                       // calculate integer y coords for triangle points
+                       __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_shuffle_ps(_mm_load_ps(screen[0]), _mm_load_ps(screen[1]), _MM_SHUFFLE(1, 0, 1, 0))),
+                                                                                 _mm_cvttps_epi32(_mm_shuffle_ps(_mm_load_ps(screen[2]), _mm_load_ps(screen[3]), _MM_SHUFFLE(1, 0, 1, 0)))),
+                                       screenir, screenmin, screenmax;
+                       if (numpoints <= 3) screeni = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(2, 2, 1, 0));
+                       screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
+                       screenmin = _mm_min_epi16(screeni, screenir);
+                       screenmax = _mm_max_epi16(screeni, screenir);
+                       screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
+                       screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
+                       screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
+                       screenmax = _mm_min_epi16(screenmax, _mm_setr_epi16(width-1, height-1, 0, 0, 0, 0, 0, 0));
+                       // skip offscreen triangles
+                       {
+                               __m128i cc = _mm_cmplt_epi16(screenmax, screenmin);
+                               if (_mm_extract_epi16(cc, 0)|_mm_extract_epi16(cc, 1))
+                                       continue;
+                       }
+                       starty = _mm_extract_epi16(screenmin, 1);
+                       endy = _mm_extract_epi16(screenmax, 1)+1;
+                       _mm_store_si128((__m128i *)screeny, _mm_srai_epi32(screeni, 16));
                }
-               //extent[0]--;
-               //extent[1]--;
-               extent[2]++;
-               extent[3]++;
-               if (extent[0] < 0)
-                       extent[0] = 0;
-               if (extent[1] < 0)
-                       extent[1] = 0;
-               if (extent[2] > width)
-                       extent[2] = width;
-               if (extent[3] > height)
-                       extent[3] = height;
-               // skip offscreen triangles
-               if (extent[2] <= extent[0] || extent[3] <= extent[1])
-                       continue;
+
                // okay, this triangle is going to produce spans, we'd better project
                // the interpolants now (this is what gives perspective texturing),
                // this consists of simply multiplying all arrays by the W coord
@@ -3743,11 +3724,7 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                        {
                                for (k = 0;k < numpoints;k++)
                                {
-                                       w = screen[k][3];
-                                       proj[j][k][0] = clipped[j][k][0] * w;
-                                       proj[j][k][1] = clipped[j][k][1] * w;
-                                       proj[j][k][2] = clipped[j][k][2] * w;
-                                       proj[j][k][3] = clipped[j][k][3] * w;
+                                       proj[j][k] = _mm_mul_ps(_mm_load_ps(clipped[j][k]), _mm_set1_ps(screen[k][3]));
                                }
                        }
                }
@@ -3792,7 +3769,7 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                // TODO: optimize?  the edges could have data slopes calculated
                // TODO: optimize?  the data slopes could be calculated as a plane
                //       (2D slopes) to avoid any interpolation along edges at all
-               for (y = extent[1];y < extent[3];y++)
+               for (y = starty;y < endy;y++)
                {
                        // get center of pixel y
                        yc = y;
@@ -3903,95 +3880,60 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                        memcpy(span->mip, mip, sizeof(span->mip));
                        span->start = y * width + startx;
                        span->length = endx - startx;
-                       j = DPSOFTRAST_ARRAY_TOTAL;
-                       if (edge0xf < edge1xf)
                        {
-                               span->data[0][j][0] = screen[edge0p][0] * edge0yilerp + screen[edge0n][0] * edge0ylerp;
-                               span->data[0][j][1] = screen[edge0p][1] * edge0yilerp + screen[edge0n][1] * edge0ylerp;
-                               span->data[0][j][2] = screen[edge0p][2] * edge0yilerp + screen[edge0n][2] * edge0ylerp;
-                               span->data[0][j][3] = screen[edge0p][3] * edge0yilerp + screen[edge0n][3] * edge0ylerp;
-                               span->data[1][j][0] = screen[edge1p][0] * edge1yilerp + screen[edge1n][0] * edge1ylerp;
-                               span->data[1][j][1] = screen[edge1p][1] * edge1yilerp + screen[edge1n][1] * edge1ylerp;
-                               span->data[1][j][2] = screen[edge1p][2] * edge1yilerp + screen[edge1n][2] * edge1ylerp;
-                               span->data[1][j][3] = screen[edge1p][3] * edge1yilerp + screen[edge1n][3] * edge1ylerp;
-                               for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
+                               __m128 edge0ylerpm = _mm_set1_ps(edge0ylerp), edge0yilerpm = _mm_set1_ps(edge0yilerp),
+                                          edge1ylerpm = _mm_set1_ps(edge1ylerp), edge1yilerpm = _mm_set1_ps(edge1yilerp),
+                                          spanilengthm = _mm_set1_ps(spanilength), startxlerpm = _mm_set1_ps(startxlerp),
+                                          data0, data1;
+                               j = DPSOFTRAST_ARRAY_TOTAL;
+                               if (edge0xf < edge1xf)
                                {
-                                       //if (arraymask[j])
+                                       data0 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(screen[edge0p]), edge0yilerpm), _mm_mul_ps(_mm_load_ps(screen[edge0n]), edge0ylerpm));
+                                       data1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(screen[edge1p]), edge1yilerpm), _mm_mul_ps(_mm_load_ps(screen[edge1n]), edge1ylerpm));
+                                       data1 = _mm_mul_ps(_mm_sub_ps(data1, data0), spanilengthm);
+                                       data0 = _mm_add_ps(data0, _mm_mul_ps(data1, startxlerpm));
+                                       _mm_store_ps(span->data[0][j], data0);
+                                       _mm_store_ps(span->data[1][j], data1);
+                                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                                        {
-                                               span->data[0][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
-                                               span->data[0][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
-                                               span->data[0][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
-                                               span->data[0][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
-                                               span->data[1][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
-                                               span->data[1][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
-                                               span->data[1][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
-                                               span->data[1][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
+                                               //if (arraymask[j])
+                                               {
+                                                       data0 = _mm_add_ps(_mm_mul_ps(proj[j][edge0p], edge0yilerpm), _mm_mul_ps(proj[j][edge0n], edge0ylerpm));
+                                                       data1 = _mm_add_ps(_mm_mul_ps(proj[j][edge1p], edge1yilerpm), _mm_mul_ps(proj[j][edge1n], edge1ylerpm));
+                                                       data1 = _mm_mul_ps(_mm_sub_ps(data1, data0), spanilengthm);
+                                                       data0 = _mm_add_ps(data0, _mm_mul_ps(data1, startxlerpm));
+                                                       _mm_store_ps(span->data[0][j], data0);
+                                                       _mm_store_ps(span->data[1][j], data1);
+                                               }
                                        }
                                }
-                       }
-                       else
-                       {
-                               span->data[0][j][0] = screen[edge1p][0] * edge1yilerp + screen[edge1n][0] * edge1ylerp;
-                               span->data[0][j][1] = screen[edge1p][1] * edge1yilerp + screen[edge1n][1] * edge1ylerp;
-                               span->data[0][j][2] = screen[edge1p][2] * edge1yilerp + screen[edge1n][2] * edge1ylerp;
-                               span->data[0][j][3] = screen[edge1p][3] * edge1yilerp + screen[edge1n][3] * edge1ylerp;
-                               span->data[1][j][0] = screen[edge0p][0] * edge0yilerp + screen[edge0n][0] * edge0ylerp;
-                               span->data[1][j][1] = screen[edge0p][1] * edge0yilerp + screen[edge0n][1] * edge0ylerp;
-                               span->data[1][j][2] = screen[edge0p][2] * edge0yilerp + screen[edge0n][2] * edge0ylerp;
-                               span->data[1][j][3] = screen[edge0p][3] * edge0yilerp + screen[edge0n][3] * edge0ylerp;
-                               for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
+                               else
                                {
-                                       //if (arraymask[j])
+                                       data0 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(screen[edge1p]), edge1yilerpm), _mm_mul_ps(_mm_load_ps(screen[edge1n]), edge1ylerpm));
+                                       data1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(screen[edge0p]), edge0yilerpm), _mm_mul_ps(_mm_load_ps(screen[edge0n]), edge0ylerpm));
+                                       data1 = _mm_mul_ps(_mm_sub_ps(data1, data0), spanilengthm);
+                                       data0 = _mm_add_ps(data0, _mm_mul_ps(data1, startxlerpm));
+                                       _mm_store_ps(span->data[0][j], data0);
+                                       _mm_store_ps(span->data[1][j], data1);
+                                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                                        {
-                                               span->data[0][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
-                                               span->data[0][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
-                                               span->data[0][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
-                                               span->data[0][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
-                                               span->data[1][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
-                                               span->data[1][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
-                                               span->data[1][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
-                                               span->data[1][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
+                                               //if (arraymask[j])
+                                               {
+                                                       data0 = _mm_add_ps(_mm_mul_ps(proj[j][edge1p], edge1yilerpm), _mm_mul_ps(proj[j][edge1n], edge1ylerpm));
+                                                       data1 = _mm_add_ps(_mm_mul_ps(proj[j][edge0p], edge0yilerpm), _mm_mul_ps(proj[j][edge0n], edge0ylerpm));
+                                                       data1 = _mm_mul_ps(_mm_sub_ps(data1, data0), spanilengthm);
+                                                       data0 = _mm_add_ps(data0, _mm_mul_ps(data1, startxlerpm));
+                                                       _mm_store_ps(span->data[0][j], data0);
+                                                       _mm_store_ps(span->data[1][j], data1);
+                                               }
                                        }
                                }
                        }
-                       // change data[1][n][] to be a data slope
-                       j = DPSOFTRAST_ARRAY_TOTAL;
-                       span->data[1][j][0] = (span->data[1][j][0] - span->data[0][j][0]) * spanilength;
-                       span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
-                       span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
-                       span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
-                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
-                       {
-                               //if (arraymask[j])
-                               {
-                                       span->data[1][j][0] = (span->data[1][j][0] - span->data[0][j][0]) * spanilength;
-                                       span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
-                                       span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
-                                       span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
-                               }
-                       }
-                       // adjust the data[0][n][] to be correct for the pixel centers
-                       // this also handles horizontal clipping where a major part of the
-                       // span may be off the left side of the screen
-                       j = DPSOFTRAST_ARRAY_TOTAL;
-                       span->data[0][j][0] += span->data[1][j][0] * startxlerp;
-                       span->data[0][j][1] += span->data[1][j][1] * startxlerp;
-                       span->data[0][j][2] += span->data[1][j][2] * startxlerp;
-                       span->data[0][j][3] += span->data[1][j][3] * startxlerp;
-                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
-                       {
-                               //if (arraymask[j])
-                               {
-                                       span->data[0][j][0] += span->data[1][j][0] * startxlerp;
-                                       span->data[0][j][1] += span->data[1][j][1] * startxlerp;
-                                       span->data[0][j][2] += span->data[1][j][2] * startxlerp;
-                                       span->data[0][j][3] += span->data[1][j][3] * startxlerp;
-                               }
-                       }
                        // to keep the shader routines from needing more than a small
                        // buffer for pixel intermediate data, we split long spans...
                        while (span->length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
                        {
+                               __m128 maxspanlengthm;
                                span->length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
                                if (dpsoftrast.draw.numspans >= DPSOFTRAST_DRAW_MAXSPANQUEUE)
                                {
@@ -4005,18 +3947,13 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                                span->start = y * width + startx;
                                span->length = endx - startx;
                                j = DPSOFTRAST_ARRAY_TOTAL;
-                               span->data[0][j][0] += span->data[1][j][0] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                               span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                               span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                               span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                               maxspanlengthm = _mm_set1_ps(DPSOFTRAST_DRAW_MAXSPANLENGTH);
+                               _mm_store_ps(span->data[0][j], _mm_add_ps(_mm_load_ps(span->data[0][j]), _mm_mul_ps(_mm_load_ps(span->data[1][j]), maxspanlengthm)));
                                for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                                {
                                        //if (arraymask[j])
                                        {
-                                               span->data[0][j][0] += span->data[1][j][0] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                               span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                               span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                               span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                                                _mm_store_ps(span->data[0][j], _mm_add_ps(_mm_load_ps(span->data[0][j]), _mm_mul_ps(_mm_load_ps(span->data[1][j]), maxspanlengthm)));
                                        }
                                }
                        }
@@ -4036,6 +3973,7 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const i
                DPSOFTRAST_Draw_ProcessSpans();
                dpsoftrast.draw.numspans = 0;
        }
+#endif
 }
 
 void DPSOFTRAST_Draw_DebugPoints(void)