X-Git-Url: https://de.git.xonotic.org/?p=voretournament%2Fvoretournament.git;a=blobdiff_plain;f=misc%2Fsource%2Fdarkplaces-src%2Fmod_skeletal_animatevertices_sse.c;fp=misc%2Fsource%2Fdarkplaces-src%2Fmod_skeletal_animatevertices_sse.c;h=648ab31a15460b7f333e6a8686d8df28bbbc554c;hp=f575d24522db67e7b2d3f604e639905f8d6706c4;hb=1d6d760b32b75b7915660035fd9e949f83340e96;hpb=1398386f00445187817671c1a87907362f14004d diff --git a/misc/source/darkplaces-src/mod_skeletal_animatevertices_sse.c b/misc/source/darkplaces-src/mod_skeletal_animatevertices_sse.c index f575d245..648ab31a 100644 --- a/misc/source/darkplaces-src/mod_skeletal_animatevertices_sse.c +++ b/misc/source/darkplaces-src/mod_skeletal_animatevertices_sse.c @@ -16,7 +16,6 @@ void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const f matrix4x4_t *bonepose; matrix4x4_t *boneposerelative; float m[12]; - matrix4x4_t mm, mm2; const blendweights_t * RESTRICT weights; int num_vertices_minus_one; @@ -34,69 +33,189 @@ void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const f { for (i = 0;i < model->num_bones;i++) { - // relativetransforms is in GL column-major order, which is what we need for SSE - // transposed style processing + const float * RESTRICT n = model->data_baseboneposeinverse + i * 12; + matrix4x4_t * RESTRICT s = &skeleton->relativetransforms[i]; + matrix4x4_t * RESTRICT b = &bonepose[i]; + matrix4x4_t * RESTRICT r = &boneposerelative[i]; + __m128 b0, b1, b2, b3, r0, r1, r2, r3, nr; if (model->data_bones[i].parent >= 0) - Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &skeleton->relativetransforms[i]); + { + const matrix4x4_t * RESTRICT p = &bonepose[model->data_bones[i].parent]; + __m128 s0 = _mm_loadu_ps(s->m[0]), s1 = _mm_loadu_ps(s->m[1]), s2 = _mm_loadu_ps(s->m[2]); +#ifdef OPENGLORIENTATION + __m128 s3 = _mm_loadu_ps(s->m[3]); +#define SKELETON_MATRIX(r, c) _mm_shuffle_ps(s##c, s##c, _MM_SHUFFLE(r, r, r, r)) +#else +#define SKELETON_MATRIX(r, c) _mm_shuffle_ps(s##r, s##r, _MM_SHUFFLE(c, c, c, c)) +#endif + __m128 pr = _mm_load_ps(p->m[0]); + b0 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 0)); + b1 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 1)); + b2 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 2)); + b3 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 3)); + pr = _mm_load_ps(p->m[1]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, SKELETON_MATRIX(1, 0))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, SKELETON_MATRIX(1, 1))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, SKELETON_MATRIX(1, 2))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, SKELETON_MATRIX(1, 3))); + pr = _mm_load_ps(p->m[2]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, SKELETON_MATRIX(2, 0))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, SKELETON_MATRIX(2, 1))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, SKELETON_MATRIX(2, 2))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, SKELETON_MATRIX(2, 3))); + b3 = _mm_add_ps(b3, _mm_load_ps(p->m[3])); + } else - memcpy(&bonepose[i], &skeleton->relativetransforms[i], sizeof(matrix4x4_t)); - - // create a relative deformation matrix to describe displacement - // from the base mesh, which is used by the actual weighting - Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major - Matrix4x4_Concat(&mm2, &bonepose[i], &mm); - Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose + { + b0 = _mm_loadu_ps(s->m[0]); + b1 = _mm_loadu_ps(s->m[1]); + b2 = _mm_loadu_ps(s->m[2]); + b3 = _mm_loadu_ps(s->m[3]); +#ifndef OPENGLORIENTATION + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); +#endif + } + _mm_store_ps(b->m[0], b0); + _mm_store_ps(b->m[1], b1); + _mm_store_ps(b->m[2], b2); + _mm_store_ps(b->m[3], b3); + nr = _mm_loadu_ps(n); + r0 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0))); + r1 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1))); + r2 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2))); + r3 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3))); + nr = _mm_loadu_ps(n+4); + r0 = _mm_add_ps(r0, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + nr = _mm_loadu_ps(n+8); + r0 = _mm_add_ps(r0, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + r3 = _mm_add_ps(r3, b3); + _mm_store_ps(r->m[0], r0); + _mm_store_ps(r->m[1], r1); + _mm_store_ps(r->m[2], r2); + _mm_store_ps(r->m[3], r3); } } else { - float originscale = model->num_posescale; - float x,y,z,w,lerp; - const short * RESTRICT pose6s; - for (i = 0;i < model->num_bones;i++) { - memset(m, 0, sizeof(m)); - for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) + const short * RESTRICT pose7s = model->data_poses7s + 7 * (frameblend[0].subframe * model->num_bones + i); + float lerp = frameblend[0].lerp, + tx = pose7s[0], ty = pose7s[1], tz = pose7s[2], + rx = pose7s[3] * lerp, + ry = pose7s[4] * lerp, + rz = pose7s[5] * lerp, + rw = pose7s[6] * lerp, + dx = tx*rw + ty*rz - tz*ry, + dy = -tx*rz + ty*rw + tz*rx, + dz = tx*ry - ty*rx + tz*rw, + dw = -tx*rx - ty*ry - tz*rz, + scale, sx, sy, sz, sw; + for (blends = 1;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) { - pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i); - lerp = frameblend[blends].lerp; - x = pose6s[3] * (1.0f / 32767.0f); - y = pose6s[4] * (1.0f / 32767.0f); - z = pose6s[5] * (1.0f / 32767.0f); - w = 1.0f - (x*x+y*y+z*z); - w = w > 0.0f ? -sqrt(w) : 0.0f; - m[ 0] += (1-2*(y*y+z*z)) * lerp; - m[ 1] += ( 2*(x*y-z*w)) * lerp; - m[ 2] += ( 2*(x*z+y*w)) * lerp; - m[ 3] += (pose6s[0] * originscale) * lerp; - m[ 4] += ( 2*(x*y+z*w)) * lerp; - m[ 5] += (1-2*(x*x+z*z)) * lerp; - m[ 6] += ( 2*(y*z-x*w)) * lerp; - m[ 7] += (pose6s[1] * originscale) * lerp; - m[ 8] += ( 2*(x*z-y*w)) * lerp; - m[ 9] += ( 2*(y*z+x*w)) * lerp; - m[10] += (1-2*(x*x+y*y)) * lerp; - m[11] += (pose6s[2] * originscale) * lerp; + const short * RESTRICT pose7s = model->data_poses7s + 7 * (frameblend[blends].subframe * model->num_bones + i); + float lerp = frameblend[blends].lerp, + tx = pose7s[0], ty = pose7s[1], tz = pose7s[2], + qx = pose7s[3], qy = pose7s[4], qz = pose7s[5], qw = pose7s[6]; + if(rx*qx + ry*qy + rz*qz + rw*qw < 0) lerp = -lerp; + qx *= lerp; + qy *= lerp; + qz *= lerp; + qw *= lerp; + rx += qx; + ry += qy; + rz += qz; + rw += qw; + dx += tx*qw + ty*qz - tz*qy; + dy += -tx*qz + ty*qw + tz*qx; + dz += tx*qy - ty*qx + tz*qw; + dw += -tx*qx - ty*qy - tz*qz; } - VectorNormalize(m ); - VectorNormalize(m + 4); - VectorNormalize(m + 8); + scale = 1.0f / (rx*rx + ry*ry + rz*rz + rw*rw); + sx = rx * scale; + sy = ry * scale; + sz = rz * scale; + sw = rw * scale; + m[0] = sw*rw + sx*rx - sy*ry - sz*rz; + m[1] = 2*(sx*ry - sw*rz); + m[2] = 2*(sx*rz + sw*ry); + m[3] = model->num_posescale*(dx*sw - dy*sz + dz*sy - dw*sx); + m[4] = 2*(sx*ry + sw*rz); + m[5] = sw*rw + sy*ry - sx*rx - sz*rz; + m[6] = 2*(sy*rz - sw*rx); + m[7] = model->num_posescale*(dx*sz + dy*sw - dz*sx - dw*sy); + m[8] = 2*(sx*rz - sw*ry); + m[9] = 2*(sy*rz + sw*rx); + m[10] = sw*rw + sz*rz - sx*rx - sy*ry; + m[11] = model->num_posescale*(dy*sx + dz*sw - dx*sy - dw*sz); if (i == r_skeletal_debugbone.integer) m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value; m[3] *= r_skeletal_debugtranslatex.value; m[7] *= r_skeletal_debugtranslatey.value; m[11] *= r_skeletal_debugtranslatez.value; - Matrix4x4_FromArray12FloatD3D(&mm, m); - if (model->data_bones[i].parent >= 0) - Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &mm); - else - memcpy(&bonepose[i], &mm, sizeof(mm)); - // create a relative deformation matrix to describe displacement - // from the base mesh, which is used by the actual weighting - Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major - Matrix4x4_Concat(&mm2, &bonepose[i], &mm); - Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose + { + const float * RESTRICT n = model->data_baseboneposeinverse + i * 12; + matrix4x4_t * RESTRICT b = &bonepose[i]; + matrix4x4_t * RESTRICT r = &boneposerelative[i]; + __m128 b0, b1, b2, b3, r0, r1, r2, r3, nr; + if (model->data_bones[i].parent >= 0) + { + const matrix4x4_t * RESTRICT p = &bonepose[model->data_bones[i].parent]; + __m128 pr = _mm_load_ps(p->m[0]); + b0 = _mm_mul_ps(pr, _mm_set1_ps(m[0])); + b1 = _mm_mul_ps(pr, _mm_set1_ps(m[1])); + b2 = _mm_mul_ps(pr, _mm_set1_ps(m[2])); + b3 = _mm_mul_ps(pr, _mm_set1_ps(m[3])); + pr = _mm_load_ps(p->m[1]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, _mm_set1_ps(m[4]))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, _mm_set1_ps(m[5]))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, _mm_set1_ps(m[6]))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, _mm_set1_ps(m[7]))); + pr = _mm_load_ps(p->m[2]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, _mm_set1_ps(m[8]))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, _mm_set1_ps(m[9]))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, _mm_set1_ps(m[10]))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, _mm_set1_ps(m[11]))); + b3 = _mm_add_ps(b3, _mm_load_ps(p->m[3])); + } + else + { + b0 = _mm_setr_ps(m[0], m[4], m[8], 0.0f); + b1 = _mm_setr_ps(m[1], m[5], m[9], 0.0f); + b2 = _mm_setr_ps(m[2], m[6], m[10], 0.0f); + b3 = _mm_setr_ps(m[3], m[7], m[11], 1.0f); + } + _mm_store_ps(b->m[0], b0); + _mm_store_ps(b->m[1], b1); + _mm_store_ps(b->m[2], b2); + _mm_store_ps(b->m[3], b3); + nr = _mm_loadu_ps(n); + r0 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0))); + r1 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1))); + r2 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2))); + r3 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3))); + nr = _mm_loadu_ps(n+4); + r0 = _mm_add_ps(r0, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + nr = _mm_loadu_ps(n+8); + r0 = _mm_add_ps(r0, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + r3 = _mm_add_ps(r3, b3); + _mm_store_ps(r->m[0], r0); + _mm_store_ps(r->m[1], r1); + _mm_store_ps(r->m[2], r2); + _mm_store_ps(r->m[3], r3); + } } }