+ __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
+ __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
+ __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
+ m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
+ m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
+ m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
+ m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
+ #define BBFRONT(k, pos) \
+ { \
+ DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
+ clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
+ if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
+ { \
+ __m128 proj; \
+ clipmask &= ~(1<<k); \
+ proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ }
+ BBFRONT(0, minpos);
+ BBFRONT(1, _mm_move_ss(minpos, maxpos));
+ BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
+ BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
+ BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
+ BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
+ BBFRONT(6, _mm_move_ss(maxpos, minpos));
+ BBFRONT(7, maxpos);
+ #define BBCLIP(k) \
+ { \
+ if (clipmask&(1<<k)) \
+ { \
+ if (!(clipmask&(1<<(k^1)))) \
+ { \
+ __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
+ __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
+ proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ if (!(clipmask&(1<<(k^2)))) \
+ { \
+ __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
+ __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
+ proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ if (!(clipmask&(1<<(k^4)))) \
+ { \
+ __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
+ __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
+ proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ } \
+ }
+ BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
+ viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
+ viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
+ minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
+ maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
+ minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
+ maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
+ *starty = _mm_cvttss_si32(maxproj);
+ *endy = _mm_cvttss_si32(minproj)+1;
+ return clipmask;
+}
+
+/*
+ * Pass-through variant of the vertex projection: no matrix is applied here.
+ *
+ * For each of the numitems vertices (4 floats apiece) read from in4f:
+ *   - the vertex is copied unchanged to out4f,
+ *   - DPSOFTRAST_PROJECTVERTEX is applied to it using dpsoftrast.fb_viewportcenter
+ *     and dpsoftrast.fb_viewportscale, and the result is written to screen4f,
+ *   - a component-wise minimum and maximum over all input vertices is tracked.
+ *
+ * in4f may be unaligned (it is tested against ALIGN_SIZE and read with
+ * unaligned loads when necessary).  out4f and screen4f are always written with
+ * _mm_store_ps, i.e. aligned stores.
+ *
+ * NOTE: the first vertex of in4f is loaded to seed the bounds before the loop
+ * runs, so in4f has to point at readable memory even when the loop executes
+ * zero times.
+ *
+ * Returns 0 when starty or endy is NULL.  Otherwise the min/max box is handed
+ * to DPSOFTRAST_Vertex_BoundY together with an identity matrix, and that
+ * function's return value is passed back to the caller.
+ */
+static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
+{
+	static const float identitymatrix16f[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
+	float *outend = out4f + numitems*4;
+	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
+	__m128 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
+	__m128 boundsmin, boundsmax;
+	ALIGN(float boundsminf[4]);
+	ALIGN(float boundsmaxf[4]);
+
+	if ((((size_t)in4f) & (ALIGN_SIZE-1)) == 0)
+	{
+		// source pointer is ALIGN_SIZE-aligned: use aligned loads
+		boundsmin = boundsmax = _mm_load_ps(in4f);
+		for (; out4f < outend; in4f += 4, out4f += 4, screen4f += 4)
+		{
+			__m128 v = _mm_load_ps(in4f);
+			boundsmin = _mm_min_ps(boundsmin, v);
+			boundsmax = _mm_max_ps(boundsmax, v);
+			// keep the unprojected vertex, then overwrite v with its projection
+			_mm_store_ps(out4f, v);
+			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
+			_mm_store_ps(screen4f, v);
+		}
+	}
+	else
+	{
+		// misaligned source pointer: same loop, but with unaligned loads
+		boundsmin = boundsmax = _mm_loadu_ps(in4f);
+		for (; out4f < outend; in4f += 4, out4f += 4, screen4f += 4)
+		{
+			__m128 v = _mm_loadu_ps(in4f);
+			boundsmin = _mm_min_ps(boundsmin, v);
+			boundsmax = _mm_max_ps(boundsmax, v);
+			// keep the unprojected vertex, then overwrite v with its projection
+			_mm_store_ps(out4f, v);
+			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
+			_mm_store_ps(screen4f, v);
+		}
+	}
+
+	// the Y range is only computed when the caller supplied both outputs
+	if (!starty || !endy)
+		return 0;
+
+	// spill the bounds to aligned float arrays for DPSOFTRAST_Vertex_BoundY
+	_mm_store_ps(boundsminf, boundsmin);
+	_mm_store_ps(boundsmaxf, boundsmax);
+	return DPSOFTRAST_Vertex_BoundY(starty, endy, boundsminf, boundsmaxf, identitymatrix16f);
+}
+
+static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
+{
+ static const float identitymatrix16f[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
+ __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
+ float *end;
+ if (!memcmp(identitymatrix16f, inmatrix16f, sizeof(float[16])))
+ return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
+ end = out4f + numitems*4;
+ viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
+ viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
+ m0 = _mm_loadu_ps(inmatrix16f);
+ m1 = _mm_loadu_ps(inmatrix16f + 4);
+ m2 = _mm_loadu_ps(inmatrix16f + 8);
+ m3 = _mm_loadu_ps(inmatrix16f + 12);
+ if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
+ {
+ minpos = maxpos = _mm_loadu_ps(in4f);
+ while (out4f < end)
+ {
+ __m128 v = _mm_loadu_ps(in4f);
+ minpos = _mm_min_ps(minpos, v);
+ maxpos = _mm_max_ps(maxpos, v);
+ DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
+ _mm_store_ps(out4f, v);
+ DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
+ _mm_store_ps(screen4f, v);
+ in4f += 4;
+ out4f += 4;
+ screen4f += 4;
+ }
+ }
+ else
+ {
+ minpos = maxpos = _mm_load_ps(in4f);
+ while (out4f < end)
+ {
+ __m128 v = _mm_load_ps(in4f);
+ minpos = _mm_min_ps(minpos, v);
+ maxpos = _mm_max_ps(maxpos, v);
+ DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
+ _mm_store_ps(out4f, v);
+ DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
+ _mm_store_ps(screen4f, v);
+ in4f += 4;
+ out4f += 4;
+ screen4f += 4;
+ }
+ }
+ if (starty && endy)
+ {
+ ALIGN(float minposf[4]);
+ ALIGN(float maxposf[4]);
+ _mm_store_ps(minposf, minpos);
+ _mm_store_ps(maxposf, maxpos);
+ return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
+ }
+ return 0;