use dual quaternions for frame blends
[xonotic/darkplaces.git] / mod_skeletal_animatevertices_sse.c
1 #include "mod_skeletal_animatevertices_sse.h"
2
3 #ifdef SSE_POSSIBLE
4
5 #ifdef MATRIX4x4_OPENGLORIENTATION
6 #error "SSE skeletal requires D3D matrix layout"
7 #endif
8
9 #include <xmmintrin.h>
10
11 void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f)
12 {
13         // vertex weighted skeletal
14         int i, k;
15         int blends;
16         matrix4x4_t *bonepose;
17         matrix4x4_t *boneposerelative;
18         float m[12];
19         matrix4x4_t mm, mm2;
20         const blendweights_t * RESTRICT weights;
21         int num_vertices_minus_one;
22
23         num_vertices_minus_one = model->surfmesh.num_vertices - 1;
24
25         //unsigned long long ts = rdtsc();
26         bonepose = (matrix4x4_t *) Mod_Skeletal_AnimateVertices_AllocBuffers(sizeof(matrix4x4_t) * (model->num_bones*2 + model->surfmesh.num_blends));
27         boneposerelative = bonepose + model->num_bones;
28
29         if (skeleton && !skeleton->relativetransforms)
30                 skeleton = NULL;
31
32         // interpolate matrices
33         if (skeleton)
34         {
35                 for (i = 0;i < model->num_bones;i++)
36                 {
37                         // relativetransforms is in GL column-major order, which is what we need for SSE
38                         // transposed style processing
39                         if (model->data_bones[i].parent >= 0)
40                                 Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &skeleton->relativetransforms[i]);
41                         else
42                                 memcpy(&bonepose[i], &skeleton->relativetransforms[i], sizeof(matrix4x4_t));
43
44                         // create a relative deformation matrix to describe displacement
45                         // from the base mesh, which is used by the actual weighting
46                         Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major
47                         Matrix4x4_Concat(&mm2, &bonepose[i], &mm);
48                         Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose
49                 }
50         }
51         else
52         {
53         float originscale = -model->num_posescale;
54         for (i = 0;i < model->num_bones;i++)
55         {
56             const short * RESTRICT pose7s = model->data_poses7s + 7 * (frameblend[0].subframe * model->num_bones + i);
57             float lerp = frameblend[0].lerp,
58                 tx = pose7s[0], ty = pose7s[1], tz = pose7s[2],
59                 rx = pose7s[3] * lerp,
60                 ry = pose7s[4] * lerp,
61                 rz = pose7s[5] * lerp,
62                 rw = pose7s[6] * lerp,
63                 dx = tx*rw + ty*rz - tz*ry,
64                 dy = -tx*rz + ty*rw + tz*rx,
65                 dz = tx*ry - ty*rx + tz*rw,
66                 dw = -tx*rx - ty*ry - tz*rz,
67                 scale;
68             for (blends = 1;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++)
69             {
70                 const short * RESTRICT pose7s = model->data_poses7s + 7 * (frameblend[blends].subframe * model->num_bones + i);
71                 float lerp = frameblend[blends].lerp,
72                     tx = pose7s[0], ty = pose7s[1], tz = pose7s[2],
73                     qx = pose7s[3], qy = pose7s[4], qz = pose7s[5], qw = pose7s[6];
74                 if(rx*qx + ry*qy + rz*qz + rw*qw < 0) lerp = -lerp;
75                 qx *= lerp;
76                 qy *= lerp;
77                 qz *= lerp;
78                 qw *= lerp;
79                 rx += qx;
80                 ry += qy;
81                 rz += qz;
82                 rw += qw;
83                 dx += tx*qw + ty*qz - tz*qy;
84                 dy += -tx*qz + ty*qw + tz*qx;
85                 dz += tx*qy - ty*qx + tz*qw;
86                 dw += -tx*qx - ty*qy - tz*qz;
87             }
88             scale = 1.0f / (rx*rx + ry*ry + rz*rz + rw*rw);
89             m[0] = scale*(rw*rw + rx*rx - ry*ry - rz*rz);
90             m[1] = 2*scale*(rx*ry - rw*rz);
91             m[2] = 2*scale*(rx*rz + rw*ry);
92             m[3] = originscale*scale*(dw*rx - dx*rw + dy*rz - dz*ry);
93             m[4] = 2*scale*(rx*ry + rw*rz);
94             m[5] = scale*(rw*rw + ry*ry - rx*rx - rz*rz);
95             m[6] = 2*scale*(ry*rz - rw*rx);
96             m[7] = originscale*scale*(dw*ry - dx*rz - dy*rw + dz*rx);
97             m[8] = 2*scale*(rx*rz - rw*ry);
98             m[9] = 2*scale*(ry*rz + rw*rx);
99             m[10] = scale*(rw*rw + rz*rz - rx*rx - ry*ry);
100             m[11] = originscale*scale*(dw*rz + dx*ry - dy*rx - dz*rw);
101                         if (i == r_skeletal_debugbone.integer)
102                                 m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value;
103                         m[3] *= r_skeletal_debugtranslatex.value;
104                         m[7] *= r_skeletal_debugtranslatey.value;
105                         m[11] *= r_skeletal_debugtranslatez.value;
106                         Matrix4x4_FromArray12FloatD3D(&mm, m);
107                         if (model->data_bones[i].parent >= 0)
108                                 Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &mm);
109                         else
110                                 memcpy(&bonepose[i], &mm, sizeof(mm));
111                         // create a relative deformation matrix to describe displacement
112                         // from the base mesh, which is used by the actual weighting
113                         Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major
114                         Matrix4x4_Concat(&mm2, &bonepose[i], &mm);
115                         Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose
116                 }
117         }
118
119         // generate matrices for all blend combinations
120         weights = model->surfmesh.data_blendweights;
121         for (i = 0;i < model->surfmesh.num_blends;i++, weights++)
122         {
123                 float * RESTRICT b = &boneposerelative[model->num_bones + i].m[0][0];
124                 const float * RESTRICT m = &boneposerelative[weights->index[0]].m[0][0];
125                 float f = weights->influence[0] * (1.0f / 255.0f);
126                 __m128 fv = _mm_set_ps1(f);
127                 __m128 b0 = _mm_load_ps(m);
128                 __m128 b1 = _mm_load_ps(m+4);
129                 __m128 b2 = _mm_load_ps(m+8);
130                 __m128 b3 = _mm_load_ps(m+12);
131                 __m128 m0, m1, m2, m3;
132                 b0 = _mm_mul_ps(b0, fv);
133                 b1 = _mm_mul_ps(b1, fv);
134                 b2 = _mm_mul_ps(b2, fv);
135                 b3 = _mm_mul_ps(b3, fv);
136                 for (k = 1;k < 4 && weights->influence[k];k++)
137                 {
138                         m = &boneposerelative[weights->index[k]].m[0][0];
139                         f = weights->influence[k] * (1.0f / 255.0f);
140                         fv = _mm_set_ps1(f);
141                         m0 = _mm_load_ps(m);
142                         m1 = _mm_load_ps(m+4);
143                         m2 = _mm_load_ps(m+8);
144                         m3 = _mm_load_ps(m+12);
145                         m0 = _mm_mul_ps(m0, fv);
146                         m1 = _mm_mul_ps(m1, fv);
147                         m2 = _mm_mul_ps(m2, fv);
148                         m3 = _mm_mul_ps(m3, fv);
149                         b0 = _mm_add_ps(m0, b0);
150                         b1 = _mm_add_ps(m1, b1);
151                         b2 = _mm_add_ps(m2, b2);
152                         b3 = _mm_add_ps(m3, b3);
153                 }
154                 _mm_store_ps(b, b0);
155                 _mm_store_ps(b+4, b1);
156                 _mm_store_ps(b+8, b2);
157                 _mm_store_ps(b+12, b3);
158         }
159
160 #define LOAD_MATRIX_SCALAR() const float * RESTRICT m = &boneposerelative[*b].m[0][0]
161
162 #define LOAD_MATRIX3() \
163         const float * RESTRICT m = &boneposerelative[*b].m[0][0]; \
164         /* bonepose array is 16 byte aligned */ \
165         __m128 m1 = _mm_load_ps((m)); \
166         __m128 m2 = _mm_load_ps((m)+4); \
167         __m128 m3 = _mm_load_ps((m)+8);
168 #define LOAD_MATRIX4() \
169         const float * RESTRICT m = &boneposerelative[*b].m[0][0]; \
170         /* bonepose array is 16 byte aligned */ \
171         __m128 m1 = _mm_load_ps((m)); \
172         __m128 m2 = _mm_load_ps((m)+4); \
173         __m128 m3 = _mm_load_ps((m)+8); \
174         __m128 m4 = _mm_load_ps((m)+12)
175
176         /* Note that matrix is 4x4 and transposed compared to non-USE_SSE codepath */
177 #define TRANSFORM_POSITION_SCALAR(in, out) \
178         (out)[0] = ((in)[0] * m[0] + (in)[1] * m[4] + (in)[2] * m[ 8] + m[12]); \
179         (out)[1] = ((in)[0] * m[1] + (in)[1] * m[5] + (in)[2] * m[ 9] + m[13]); \
180         (out)[2] = ((in)[0] * m[2] + (in)[1] * m[6] + (in)[2] * m[10] + m[14]);
181 #define TRANSFORM_VECTOR_SCALAR(in, out) \
182         (out)[0] = ((in)[0] * m[0] + (in)[1] * m[4] + (in)[2] * m[ 8]); \
183         (out)[1] = ((in)[0] * m[1] + (in)[1] * m[5] + (in)[2] * m[ 9]); \
184         (out)[2] = ((in)[0] * m[2] + (in)[1] * m[6] + (in)[2] * m[10]);
185
186 #define TRANSFORM_POSITION(in, out) { \
187                 __m128 pin = _mm_loadu_ps(in); /* we ignore the value in the last element (x from the next vertex) */ \
188                 __m128 x = _mm_shuffle_ps(pin, pin, 0x0); \
189                 __m128 t1 = _mm_mul_ps(x, m1); \
190                 \
191                 /* y, + x */ \
192                 __m128 y = _mm_shuffle_ps(pin, pin, 0x55); \
193                 __m128 t2 = _mm_mul_ps(y, m2); \
194                 __m128 t3 = _mm_add_ps(t1, t2); \
195                 \
196                 /* z, + (y+x) */ \
197                 __m128 z = _mm_shuffle_ps(pin, pin, 0xaa); \
198                 __m128 t4 = _mm_mul_ps(z, m3); \
199                 __m128 t5 = _mm_add_ps(t3, t4); \
200                 \
201                 /* + m3 */ \
202                 __m128 pout = _mm_add_ps(t5, m4); \
203                 _mm_storeu_ps((out), pout); \
204         }
205
206 #define TRANSFORM_VECTOR(in, out) { \
207                 __m128 vin = _mm_loadu_ps(in); \
208                 \
209                 /* x */ \
210                 __m128 x = _mm_shuffle_ps(vin, vin, 0x0); \
211                 __m128 t1 = _mm_mul_ps(x, m1); \
212                 \
213                 /* y, + x */ \
214                 __m128 y = _mm_shuffle_ps(vin, vin, 0x55); \
215                 __m128 t2 = _mm_mul_ps(y, m2); \
216                 __m128 t3 = _mm_add_ps(t1, t2); \
217                 \
218                 /* nz, + (ny + nx) */ \
219                 __m128 z = _mm_shuffle_ps(vin, vin, 0xaa); \
220                 __m128 t4 = _mm_mul_ps(z, m3); \
221                 __m128 vout = _mm_add_ps(t3, t4); \
222                 _mm_storeu_ps((out), vout); \
223         }
224
225         // transform vertex attributes by blended matrices
226         if (vertex3f)
227         {
228                 const float * RESTRICT v = model->surfmesh.data_vertex3f;
229                 const unsigned short * RESTRICT b = model->surfmesh.blends;
230                 // special case common combinations of attributes to avoid repeated loading of matrices
231                 if (normal3f)
232                 {
233                         const float * RESTRICT n = model->surfmesh.data_normal3f;
234                         if (svector3f && tvector3f)
235                         {
236                                 const float * RESTRICT sv = model->surfmesh.data_svector3f;
237                                 const float * RESTRICT tv = model->surfmesh.data_tvector3f;
238
239                                 // Note that for SSE each iteration stores one element past end, so we break one vertex short
240                                 // and handle that with scalars in that case
241                                 for (i = 0; i < num_vertices_minus_one; i++, v += 3, n += 3, sv += 3, tv += 3, b++,
242                                                 vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3)
243                                 {
244                                         LOAD_MATRIX4();
245                                         TRANSFORM_POSITION(v, vertex3f);
246                                         TRANSFORM_VECTOR(n, normal3f);
247                                         TRANSFORM_VECTOR(sv, svector3f);
248                                         TRANSFORM_VECTOR(tv, tvector3f);
249                                 }
250
251                                 // Last vertex needs to be done with scalars to avoid reading/writing 1 word past end of arrays
252                                 {
253                                         LOAD_MATRIX_SCALAR();
254                                         TRANSFORM_POSITION_SCALAR(v, vertex3f);
255                                         TRANSFORM_VECTOR_SCALAR(n, normal3f);
256                                         TRANSFORM_VECTOR_SCALAR(sv, svector3f);
257                                         TRANSFORM_VECTOR_SCALAR(tv, tvector3f);
258                                 }
259                                 //printf("elapsed ticks: %llu\n", rdtsc() - ts); // XXX
260                                 return;
261                         }
262
263                         for (i = 0;i < num_vertices_minus_one; i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3)
264                         {
265                                 LOAD_MATRIX4();
266                                 TRANSFORM_POSITION(v, vertex3f);
267                                 TRANSFORM_VECTOR(n, normal3f);
268                         }
269                         {
270                                 LOAD_MATRIX_SCALAR();
271                                 TRANSFORM_POSITION_SCALAR(v, vertex3f);
272                                 TRANSFORM_VECTOR_SCALAR(n, normal3f);
273                         }
274                 }
275                 else
276                 {
277                         for (i = 0;i < num_vertices_minus_one; i++, v += 3, b++, vertex3f += 3)
278                         {
279                                 LOAD_MATRIX4();
280                                 TRANSFORM_POSITION(v, vertex3f);
281                         }
282                         {
283                                 LOAD_MATRIX_SCALAR();
284                                 TRANSFORM_POSITION_SCALAR(v, vertex3f);
285                         }
286                 }
287         }
288
289         else if (normal3f)
290         {
291                 const float * RESTRICT n = model->surfmesh.data_normal3f;
292                 const unsigned short * RESTRICT b = model->surfmesh.blends;
293                 for (i = 0; i < num_vertices_minus_one; i++, n += 3, b++, normal3f += 3)
294                 {
295                         LOAD_MATRIX3();
296                         TRANSFORM_VECTOR(n, normal3f);
297                 }
298                 {
299                         LOAD_MATRIX_SCALAR();
300                         TRANSFORM_VECTOR_SCALAR(n, normal3f);
301                 }
302         }
303
304         if (svector3f)
305         {
306                 const float * RESTRICT sv = model->surfmesh.data_svector3f;
307                 const unsigned short * RESTRICT b = model->surfmesh.blends;
308                 for (i = 0; i < num_vertices_minus_one; i++, sv += 3, b++, svector3f += 3)
309                 {
310                         LOAD_MATRIX3();
311                         TRANSFORM_VECTOR(sv, svector3f);
312                 }
313                 {
314                         LOAD_MATRIX_SCALAR();
315                         TRANSFORM_VECTOR_SCALAR(sv, svector3f);
316                 }
317         }
318
319         if (tvector3f)
320         {
321                 const float * RESTRICT tv = model->surfmesh.data_tvector3f;
322                 const unsigned short * RESTRICT b = model->surfmesh.blends;
323                 for (i = 0; i < num_vertices_minus_one; i++, tv += 3, b++, tvector3f += 3)
324                 {
325                         LOAD_MATRIX3();
326                         TRANSFORM_VECTOR(tv, tvector3f);
327                 }
328                 {
329                         LOAD_MATRIX_SCALAR();
330                         TRANSFORM_VECTOR_SCALAR(tv, tvector3f);
331                 }
332         }
333
334 #undef LOAD_MATRIX3
335 #undef LOAD_MATRIX4
336 #undef TRANSFORM_POSITION
337 #undef TRANSFORM_VECTOR
338 #undef LOAD_MATRIX_SCALAR
339 #undef TRANSFORM_POSITION_SCALAR
340 #undef TRANSFORM_VECTOR_SCALAR
341 }
342
343 #endif