+void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
+{
+ // Record the texcoord array source for this texture unit so the vertex
+ // pipeline can fetch from it when arrays are processed later.
+ dpsoftrast.stride_texcoord[unitnum] = stride;
+ dpsoftrast.components_texcoord[unitnum] = numcomponents;
+ dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
+}
+
+DEFCOMMAND(18, SetShader, int mode; int permutation;)
+// Worker-thread side of SetShader: copies the requested shader mode and
+// permutation out of the command packet into this thread's state.
+static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
+{
+ thread->shader_mode = command->mode;
+ thread->shader_permutation = command->permutation;
+}
+void DPSOFTRAST_SetShader(int mode, int permutation)
+{
+ // Queue the shader state change for the worker threads...
+ DPSOFTRAST_Command_SetShader *cmd = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
+ cmd->permutation = permutation;
+ cmd->mode = mode;
+
+ // ...and mirror it into the main-thread copy of the render state.
+ dpsoftrast.shader_permutation = permutation;
+ dpsoftrast.shader_mode = mode;
+}
+
+DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
+// Worker-thread side of Uniform4f/Uniform4fv: copies the four float values
+// from the command packet into this thread's uniform array (each uniform
+// slot occupies four consecutive floats).
+static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
+{
+ memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
+}
+void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
+{
+ // Queue the four-component uniform update for the worker threads.
+ DPSOFTRAST_Command_Uniform4f *cmd = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
+ cmd->index = index;
+ cmd->val[0] = v0;
+ cmd->val[1] = v1;
+ cmd->val[2] = v2;
+ cmd->val[3] = v3;
+
+ // Mirror the same four components into the main-thread uniform store.
+ memcpy(&dpsoftrast.uniform4f[index*4], cmd->val, sizeof(cmd->val));
+}
+void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
+{
+ int i;
+ // Queue the four-component uniform update for the worker threads.
+ DPSOFTRAST_Command_Uniform4f *cmd = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
+ cmd->index = index;
+ // Copy the components into both the command packet and the main-thread
+ // mirror of the uniform store.
+ for (i = 0;i < 4;i++)
+ {
+ cmd->val[i] = v[i];
+ dpsoftrast.uniform4f[index*4+i] = v[i];
+ }
+}
+
+DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
+// Worker-thread side of UniformMatrix4fv: copies one 4x4 matrix (stored as
+// four consecutive vec4 uniform slots) from the command packet into this
+// thread's uniform array.
+static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
+{
+ memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
+}
+// Uploads 'arraysize' 4x4 float matrices starting at the given uniform slot;
+// each matrix occupies four consecutive vec4 uniform registers. If
+// 'transpose' is nonzero the matrices are transposed during the copy.
+// NOTE(review): without SSE2_PRESENT this compiles to a no-op and matrix
+// uniforms are silently dropped — presumably the whole rasterizer is disabled
+// in that configuration; confirm.
+void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
+{
+#ifdef SSE2_PRESENT
+ int i, index;
+ for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
+ {
+ __m128 m0, m1, m2, m3;
+ DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
+ command->index = (DPSOFTRAST_UNIFORM)index;
+ // Use aligned loads only when the caller's pointer is 16-byte aligned.
+ if (((size_t)v)&(ALIGN_SIZE-1))
+ {
+ m0 = _mm_loadu_ps(v);
+ m1 = _mm_loadu_ps(v+4);
+ m2 = _mm_loadu_ps(v+8);
+ m3 = _mm_loadu_ps(v+12);
+ }
+ else
+ {
+ m0 = _mm_load_ps(v);
+ m1 = _mm_load_ps(v+4);
+ m2 = _mm_load_ps(v+8);
+ m3 = _mm_load_ps(v+12);
+ }
+ if (transpose)
+ {
+ // In-register 4x4 transpose: interleave row pairs, then recombine
+ // low/high halves (same scheme as _MM_TRANSPOSE4_PS).
+ __m128 t0, t1, t2, t3;
+ t0 = _mm_unpacklo_ps(m0, m1);
+ t1 = _mm_unpacklo_ps(m2, m3);
+ t2 = _mm_unpackhi_ps(m0, m1);
+ t3 = _mm_unpackhi_ps(m2, m3);
+ m0 = _mm_movelh_ps(t0, t1);
+ m1 = _mm_movehl_ps(t1, t0);
+ m2 = _mm_movelh_ps(t2, t3);
+ m3 = _mm_movehl_ps(t3, t2);
+ }
+ // Store into the command packet for the workers and into the
+ // main-thread mirror of the uniform store (both 16-byte aligned).
+ _mm_store_ps(command->val, m0);
+ _mm_store_ps(command->val+4, m1);
+ _mm_store_ps(command->val+8, m2);
+ _mm_store_ps(command->val+12, m3);
+ _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
+ _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
+ _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
+ _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
+ }
+#endif
+}
+
+DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
+// Worker-thread side of Uniform1i: copies the integer value from the command
+// packet into this thread's integer-uniform array.
+static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
+{
+ thread->uniform1i[command->index] = command->val;
+}
+void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
+{
+ // Queue the integer uniform update for the worker threads.
+ DPSOFTRAST_Command_Uniform1i *cmd = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
+ cmd->index = index;
+ cmd->val = i0;
+
+ // Mirror the value into the main-thread uniform store.
+ dpsoftrast.uniform1i[index] = i0;
+}
+
+#ifdef SSE2_PRESENT
+// Copies 'size' 4-float vertex attributes from a strided source array into a
+// tightly packed, 16-byte-aligned destination buffer.
+static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
+{
+ float *stop = dst + size*4;
+ // Aligned loads are only safe when both the base pointer and the stride
+ // keep every element 16-byte aligned.
+ if (!((((size_t)src)|stride)&(ALIGN_SIZE - 1)))
+ {
+ for (;dst < stop;dst += 4, src += stride)
+ _mm_store_ps(dst, _mm_load_ps((const float *)src));
+ }
+ else
+ {
+ for (;dst < stop;dst += 4, src += stride)
+ _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
+ }
+}
+
+// Expands 'size' 3-component float attributes from a strided source into a
+// packed, 16-byte-aligned buffer of 4-component elements with w set to 1.0f.
+static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
+{
+ float *end = dst + size*4;
+ if (stride == sizeof(float[3]))
+ {
+ // Tightly packed input: convert 4 elements (12 floats, 48 bytes) per
+ // iteration using three 16-byte loads; any remainder (size&3) falls
+ // through to the strided loop below.
+ float *end4 = dst + (size&~3)*4;
+ if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
+ {
+ // Each output is built by shuffling the source lanes so lane 0 is
+ // free, inserting 1.0f there with _mm_move_ss, then rotating so
+ // the 1.0f lands in the w lane.
+ while (dst < end4)
+ {
+ __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
+ dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
+ dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dst += 16;
+ src += 4*sizeof(float[3]);
+ }
+ }
+ else
+ {
+ // Same lane-shuffle scheme as above, but with aligned loads (the
+ // 48-byte advance per iteration preserves 16-byte alignment).
+ while (dst < end4)
+ {
+ __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
+ dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
+ dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dst += 16;
+ src += 4*sizeof(float[3]);
+ }
+ }
+ }
+ // Strided (or remainder) path: one element per iteration.
+ // NOTE(review): these 16-byte loads read 4 bytes past the last 3-float
+ // element; presumably source buffers are allocated with enough slack —
+ // confirm against the engine's vertex array allocation.
+ if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
+ {
+ while (dst < end)
+ {
+ __m128 v = _mm_loadu_ps((const float *)src);
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
+ v = _mm_move_ss(v, _mm_set_ss(1.0f));
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
+ _mm_store_ps(dst, v);
+ dst += 4;
+ src += stride;
+ }
+ }
+ else
+ {
+ while (dst < end)
+ {
+ __m128 v = _mm_load_ps((const float *)src);
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
+ v = _mm_move_ss(v, _mm_set_ss(1.0f));
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
+ _mm_store_ps(dst, v);
+ dst += 4;
+ src += stride;
+ }
+ }
+}
+
+// Expands 'size' 2-component float attributes from a strided source into a
+// packed, 16-byte-aligned buffer of 4-component elements, filling z with
+// 0.0f and w with 1.0f.
+static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
+{
+ float *end = dst + size*4;
+ // Constant (0, 0, 0, 1) supplies the z and w lanes of every output.
+ __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ if (stride == sizeof(float[2]))
+ {
+ // Tightly packed input: convert two elements per 16-byte load; an odd
+ // trailing element falls through to the strided loop below.
+ float *end2 = dst + (size&~1)*4;
+ if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
+ {
+ while (dst < end2)
+ {
+ __m128 v = _mm_loadu_ps((const float *)src);
+ _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
+ _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
+ dst += 8;
+ src += 2*sizeof(float[2]);
+ }
+ }
+ else
+ {
+ while (dst < end2)
+ {
+ __m128 v = _mm_load_ps((const float *)src);
+ _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
+ _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
+ dst += 8;
+ src += 2*sizeof(float[2]);
+ }
+ }
+ }
+ // Strided (or remainder) path: an 8-byte low-half load fills x and y while
+ // the upper half keeps v2's (0, 1) for z and w.
+ // Fix: keep the const qualifier instead of casting it away.
+ while (dst < end)
+ {
+ _mm_store_ps(dst, _mm_loadl_pi(v2, (const __m64 *)src));
+ dst += 4;
+ src += stride;
+ }
+}
+
+// Converts 'size' 4-byte (e.g. RGBA8) attributes from a strided source into a
+// packed, 16-byte-aligned float buffer, scaling each byte to the 0..1 range.
+static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
+{
+ float *end = dst + size*4;
+ __m128 scale = _mm_set1_ps(1.0f/255.0f);
+ if (stride == sizeof(unsigned char[4]))
+ {
+ // Tightly packed input: convert 4 elements (16 bytes) per iteration by
+ // widening bytes -> words -> dwords, then converting to float.
+ float *end4 = dst + (size&~3)*4;
+ if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
+ {
+ while (dst < end4)
+ {
+ __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
+ _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
+ _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
+ _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
+ _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
+ dst += 16;
+ src += 4*sizeof(unsigned char[4]);
+ }
+ }
+ else
+ {
+ while (dst < end4)
+ {
+ __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
+ _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
+ _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
+ _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
+ _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
+ dst += 16;
+ src += 4*sizeof(unsigned char[4]);
+ }
+ }
+ }
+ // Strided (or remainder) path: widen one 4-byte element at a time.
+ // NOTE(review): the dereference below reads src as an int regardless of
+ // alignment — presumably fine on the x86 targets this SSE path is built
+ // for, but it is formally a misaligned access; confirm.
+ while (dst < end)
+ {
+ __m128i v = _mm_cvtsi32_si128(*(const int *)src);
+ _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
+ dst += 4;
+ src += stride;
+ }
+}
+
+// Replicates one 4-float value into every element of a packed, 16-byte-
+// aligned output array of 'size' elements.
+static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
+{
+ float *stop = dst + 4*size;
+ // Unaligned load: the source may live in arbitrary caller memory.
+ __m128 value = _mm_loadu_ps(src);
+ for (;dst < stop;dst += 4)
+ _mm_store_ps(dst, value);
+}
+#endif
+
+// Transforms 'numitems' 4-component vertices by a 4x4 matrix (out = M * in),
+// with a memcpy fast path when the matrix is identity. in4f and out4f may be
+// the same buffer; out4f must be 16-byte aligned.
+// NOTE(review): without SSE2_PRESENT this compiles to a no-op — presumably
+// the rasterizer is disabled entirely in that configuration; confirm.
+void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
+{
+#ifdef SSE2_PRESENT
+ static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
+ __m128 m0, m1, m2, m3;
+ float *end;
+ if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
+ {
+ // fast case for identity matrix
+ if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
+ return;
+ }
+ end = out4f + numitems*4;
+ // Hoist the four matrix rows into registers once for the whole batch.
+ m0 = _mm_loadu_ps(inmatrix16f);
+ m1 = _mm_loadu_ps(inmatrix16f + 4);
+ m2 = _mm_loadu_ps(inmatrix16f + 8);
+ m3 = _mm_loadu_ps(inmatrix16f + 12);
+ if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
+ {
+ // Per vertex: broadcast each input component across a register and
+ // accumulate component * matrix-row (the usual SSE mat4 * vec4 form).
+ while (out4f < end)
+ {
+ __m128 v = _mm_loadu_ps(in4f);
+ _mm_store_ps(out4f,
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
+ _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
+ out4f += 4;
+ in4f += 4;
+ }
+ }
+ else
+ {
+ // Same computation with aligned input loads.
+ while (out4f < end)
+ {
+ __m128 v = _mm_load_ps(in4f);
+ _mm_store_ps(out4f,
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
+ _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
+ out4f += 4;
+ in4f += 4;
+ }
+ }
+#endif
+}
+
+void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)