X-Git-Url: http://git.xonotic.org/?p=xonotic%2Fdarkplaces.git;a=blobdiff_plain;f=mod_skeletal_animatevertices_sse.c;h=0c2fa5d9b2b477e632f6606eed3f0377133f230a;hp=d6f71f1a90b40ac04b9d23bd18d9cbeb77380c50;hb=0207a10e1576d5ebe67730b5b98709383613f4df;hpb=8823e14b9482ce5779c09a48e9b81f397f94b55f diff --git a/mod_skeletal_animatevertices_sse.c b/mod_skeletal_animatevertices_sse.c index d6f71f1a..0c2fa5d9 100644 --- a/mod_skeletal_animatevertices_sse.c +++ b/mod_skeletal_animatevertices_sse.c @@ -8,21 +8,16 @@ #include -void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) +void Mod_Skeletal_AnimateVertices_SSE(const model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) { // vertex weighted skeletal int i, k; int blends; matrix4x4_t *bonepose; matrix4x4_t *boneposerelative; - float m[12]; - matrix4x4_t mm, mm2; const blendweights_t * RESTRICT weights; int num_vertices_minus_one; - if (!model->surfmesh.num_vertices) - return; - num_vertices_minus_one = model->surfmesh.num_vertices - 1; //unsigned long long ts = rdtsc(); @@ -37,68 +32,190 @@ void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const f { for (i = 0;i < model->num_bones;i++) { - // relativetransforms is in GL column-major order, which is what we need for SSE - // transposed style processing + const float * RESTRICT n = model->data_baseboneposeinverse + i * 12; + matrix4x4_t * RESTRICT s = &skeleton->relativetransforms[i]; + matrix4x4_t * RESTRICT b = &bonepose[i]; + matrix4x4_t * RESTRICT r = &boneposerelative[i]; + __m128 b0, b1, b2, b3, r0, r1, r2, r3, nr; if (model->data_bones[i].parent >= 0) - Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &skeleton->relativetransforms[i]); + { + const matrix4x4_t * RESTRICT p = &bonepose[model->data_bones[i].parent]; + __m128 s0 = _mm_loadu_ps(s->m[0]), s1 = _mm_loadu_ps(s->m[1]), s2 = _mm_loadu_ps(s->m[2]); +#ifdef OPENGLORIENTATION + __m128 s3 = _mm_loadu_ps(s->m[3]); +#define SKELETON_MATRIX(r, c) _mm_shuffle_ps(s##c, s##c, _MM_SHUFFLE(r, r, r, r)) +#else +#define SKELETON_MATRIX(r, c) _mm_shuffle_ps(s##r, s##r, _MM_SHUFFLE(c, c, c, c)) +#endif + __m128 pr = _mm_load_ps(p->m[0]); + b0 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 0)); + b1 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 1)); + b2 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 2)); + b3 = _mm_mul_ps(pr, SKELETON_MATRIX(0, 3)); + pr = _mm_load_ps(p->m[1]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, SKELETON_MATRIX(1, 0))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, SKELETON_MATRIX(1, 1))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, SKELETON_MATRIX(1, 2))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, SKELETON_MATRIX(1, 3))); + pr = _mm_load_ps(p->m[2]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, SKELETON_MATRIX(2, 0))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, SKELETON_MATRIX(2, 1))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, SKELETON_MATRIX(2, 2))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, SKELETON_MATRIX(2, 3))); + b3 = _mm_add_ps(b3, _mm_load_ps(p->m[3])); + } else - memcpy(&bonepose[i], &skeleton->relativetransforms[i], sizeof(matrix4x4_t)); - - // create a relative deformation matrix to describe displacement - // from the base mesh, which is used by the actual weighting - Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major - Matrix4x4_Concat(&boneposerelative[i], &bonepose[i], &mm); + { + b0 = _mm_loadu_ps(s->m[0]); + b1 = _mm_loadu_ps(s->m[1]); + b2 = _mm_loadu_ps(s->m[2]); + b3 = _mm_loadu_ps(s->m[3]); +#ifndef OPENGLORIENTATION + _MM_TRANSPOSE4_PS(b0, b1, b2, b3); +#endif + } + _mm_store_ps(b->m[0], b0); + _mm_store_ps(b->m[1], b1); + _mm_store_ps(b->m[2], b2); + _mm_store_ps(b->m[3], b3); + nr = _mm_loadu_ps(n); + r0 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0))); + r1 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1))); + r2 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2))); + r3 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3))); + nr = _mm_loadu_ps(n+4); + r0 = _mm_add_ps(r0, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + nr = _mm_loadu_ps(n+8); + r0 = _mm_add_ps(r0, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + r3 = _mm_add_ps(r3, b3); + _mm_store_ps(r->m[0], r0); + _mm_store_ps(r->m[1], r1); + _mm_store_ps(r->m[2], r2); + _mm_store_ps(r->m[3], r3); } } else { - float originscale = model->num_posescale; - float x,y,z,w,lerp; - const short * RESTRICT pose6s; - for (i = 0;i < model->num_bones;i++) { - memset(m, 0, sizeof(m)); - for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) + float m[12]; + const short * RESTRICT firstpose7s = model->data_poses7s + 7 * (frameblend[0].subframe * model->num_bones + i); + float firstlerp = frameblend[0].lerp, + firsttx = firstpose7s[0], firstty = firstpose7s[1], firsttz = firstpose7s[2], + rx = firstpose7s[3] * firstlerp, + ry = firstpose7s[4] * firstlerp, + rz = firstpose7s[5] * firstlerp, + rw = firstpose7s[6] * firstlerp, + dx = firsttx*rw + firstty*rz - firsttz*ry, + dy = -firsttx*rz + firstty*rw + firsttz*rx, + dz = firsttx*ry - firstty*rx + firsttz*rw, + dw = -firsttx*rx - firstty*ry - firsttz*rz, + scale, sx, sy, sz, sw; + for (blends = 1;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) { - pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i); - lerp = frameblend[blends].lerp; - x = pose6s[3] * (1.0f / 32767.0f); - y = pose6s[4] * (1.0f / 32767.0f); - z = pose6s[5] * (1.0f / 32767.0f); - w = 1.0f - (x*x+y*y+z*z); - w = w > 0.0f ? -sqrt(w) : 0.0f; - m[ 0] += (1-2*(y*y+z*z)) * lerp; - m[ 1] += ( 2*(x*y-z*w)) * lerp; - m[ 2] += ( 2*(x*z+y*w)) * lerp; - m[ 3] += (pose6s[0] * originscale) * lerp; - m[ 4] += ( 2*(x*y+z*w)) * lerp; - m[ 5] += (1-2*(x*x+z*z)) * lerp; - m[ 6] += ( 2*(y*z-x*w)) * lerp; - m[ 7] += (pose6s[1] * originscale) * lerp; - m[ 8] += ( 2*(x*z-y*w)) * lerp; - m[ 9] += ( 2*(y*z+x*w)) * lerp; - m[10] += (1-2*(x*x+y*y)) * lerp; - m[11] += (pose6s[2] * originscale) * lerp; + const short * RESTRICT blendpose7s = model->data_poses7s + 7 * (frameblend[blends].subframe * model->num_bones + i); + float blendlerp = frameblend[blends].lerp, + blendtx = blendpose7s[0], blendty = blendpose7s[1], blendtz = blendpose7s[2], + qx = blendpose7s[3], qy = blendpose7s[4], qz = blendpose7s[5], qw = blendpose7s[6]; + if(rx*qx + ry*qy + rz*qz + rw*qw < 0) blendlerp = -blendlerp; + qx *= blendlerp; + qy *= blendlerp; + qz *= blendlerp; + qw *= blendlerp; + rx += qx; + ry += qy; + rz += qz; + rw += qw; + dx += blendtx*qw + blendty*qz - blendtz*qy; + dy += -blendtx*qz + blendty*qw + blendtz*qx; + dz += blendtx*qy - blendty*qx + blendtz*qw; + dw += -blendtx*qx - blendty*qy - blendtz*qz; } - VectorNormalize(m ); - VectorNormalize(m + 4); - VectorNormalize(m + 8); + scale = 1.0f / (rx*rx + ry*ry + rz*rz + rw*rw); + sx = rx * scale; + sy = ry * scale; + sz = rz * scale; + sw = rw * scale; + m[0] = sw*rw + sx*rx - sy*ry - sz*rz; + m[1] = 2*(sx*ry - sw*rz); + m[2] = 2*(sx*rz + sw*ry); + m[3] = model->num_posescale*(dx*sw - dy*sz + dz*sy - dw*sx); + m[4] = 2*(sx*ry + sw*rz); + m[5] = sw*rw + sy*ry - sx*rx - sz*rz; + m[6] = 2*(sy*rz - sw*rx); + m[7] = model->num_posescale*(dx*sz + dy*sw - dz*sx - dw*sy); + m[8] = 2*(sx*rz - sw*ry); + m[9] = 2*(sy*rz + sw*rx); + m[10] = sw*rw + sz*rz - sx*rx - sy*ry; + m[11] = model->num_posescale*(dy*sx + dz*sw - dx*sy - dw*sz); if (i == r_skeletal_debugbone.integer) m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value; m[3] *= r_skeletal_debugtranslatex.value; m[7] *= r_skeletal_debugtranslatey.value; m[11] *= r_skeletal_debugtranslatez.value; - Matrix4x4_FromArray12FloatD3D(&mm, m); - if (model->data_bones[i].parent >= 0) - Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &mm); - else - memcpy(&bonepose[i], &mm, sizeof(mm)); - // create a relative deformation matrix to describe displacement - // from the base mesh, which is used by the actual weighting - Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major - Matrix4x4_Concat(&mm2, &bonepose[i], &mm); - Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose + { + const float * RESTRICT n = model->data_baseboneposeinverse + i * 12; + matrix4x4_t * RESTRICT b = &bonepose[i]; + matrix4x4_t * RESTRICT r = &boneposerelative[i]; + __m128 b0, b1, b2, b3, r0, r1, r2, r3, nr; + if (model->data_bones[i].parent >= 0) + { + const matrix4x4_t * RESTRICT p = &bonepose[model->data_bones[i].parent]; + __m128 pr = _mm_load_ps(p->m[0]); + b0 = _mm_mul_ps(pr, _mm_set1_ps(m[0])); + b1 = _mm_mul_ps(pr, _mm_set1_ps(m[1])); + b2 = _mm_mul_ps(pr, _mm_set1_ps(m[2])); + b3 = _mm_mul_ps(pr, _mm_set1_ps(m[3])); + pr = _mm_load_ps(p->m[1]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, _mm_set1_ps(m[4]))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, _mm_set1_ps(m[5]))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, _mm_set1_ps(m[6]))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, _mm_set1_ps(m[7]))); + pr = _mm_load_ps(p->m[2]); + b0 = _mm_add_ps(b0, _mm_mul_ps(pr, _mm_set1_ps(m[8]))); + b1 = _mm_add_ps(b1, _mm_mul_ps(pr, _mm_set1_ps(m[9]))); + b2 = _mm_add_ps(b2, _mm_mul_ps(pr, _mm_set1_ps(m[10]))); + b3 = _mm_add_ps(b3, _mm_mul_ps(pr, _mm_set1_ps(m[11]))); + b3 = _mm_add_ps(b3, _mm_load_ps(p->m[3])); + } + else + { + b0 = _mm_setr_ps(m[0], m[4], m[8], 0.0f); + b1 = _mm_setr_ps(m[1], m[5], m[9], 0.0f); + b2 = _mm_setr_ps(m[2], m[6], m[10], 0.0f); + b3 = _mm_setr_ps(m[3], m[7], m[11], 1.0f); + } + _mm_store_ps(b->m[0], b0); + _mm_store_ps(b->m[1], b1); + _mm_store_ps(b->m[2], b2); + _mm_store_ps(b->m[3], b3); + nr = _mm_loadu_ps(n); + r0 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0))); + r1 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1))); + r2 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2))); + r3 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3))); + nr = _mm_loadu_ps(n+4); + r0 = _mm_add_ps(r0, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + nr = _mm_loadu_ps(n+8); + r0 = _mm_add_ps(r0, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); + r1 = _mm_add_ps(r1, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)))); + r2 = _mm_add_ps(r2, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)))); + r3 = _mm_add_ps(r3, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)))); + r3 = _mm_add_ps(r3, b3); + _mm_store_ps(r->m[0], r0); + _mm_store_ps(r->m[1], r1); + _mm_store_ps(r->m[2], r2); + _mm_store_ps(r->m[3], r3); + } } } @@ -219,19 +336,19 @@ void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const f const float * RESTRICT n = model->surfmesh.data_normal3f; if (svector3f && tvector3f) { - const float * RESTRICT sv = model->surfmesh.data_svector3f; - const float * RESTRICT tv = model->surfmesh.data_tvector3f; + const float * RESTRICT svec = model->surfmesh.data_svector3f; + const float * RESTRICT tvec = model->surfmesh.data_tvector3f; // Note that for SSE each iteration stores one element past end, so we break one vertex short // and handle that with scalars in that case - for (i = 0; i < num_vertices_minus_one; i++, v += 3, n += 3, sv += 3, tv += 3, b++, + for (i = 0; i < num_vertices_minus_one; i++, v += 3, n += 3, svec += 3, tvec += 3, b++, vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3) { LOAD_MATRIX4(); TRANSFORM_POSITION(v, vertex3f); TRANSFORM_VECTOR(n, normal3f); - TRANSFORM_VECTOR(sv, svector3f); - TRANSFORM_VECTOR(tv, tvector3f); + TRANSFORM_VECTOR(svec, svector3f); + TRANSFORM_VECTOR(tvec, tvector3f); } // Last vertex needs to be done with scalars to avoid reading/writing 1 word past end of arrays @@ -239,8 +356,8 @@ void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const f LOAD_MATRIX_SCALAR(); TRANSFORM_POSITION_SCALAR(v, vertex3f); TRANSFORM_VECTOR_SCALAR(n, normal3f); - TRANSFORM_VECTOR_SCALAR(sv, svector3f); - TRANSFORM_VECTOR_SCALAR(tv, tvector3f); + TRANSFORM_VECTOR_SCALAR(svec, svector3f); + TRANSFORM_VECTOR_SCALAR(tvec, tvector3f); } //printf("elapsed ticks: %llu\n", rdtsc() - ts); // XXX return; @@ -289,31 +406,31 @@ void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const f if (svector3f) { - const float * RESTRICT sv = model->surfmesh.data_svector3f; + const float * RESTRICT svec = model->surfmesh.data_svector3f; const unsigned short * RESTRICT b = model->surfmesh.blends; - for (i = 0; i < num_vertices_minus_one; i++, sv += 3, b++, svector3f += 3) + for (i = 0; i < num_vertices_minus_one; i++, svec += 3, b++, svector3f += 3) { LOAD_MATRIX3(); - TRANSFORM_VECTOR(sv, svector3f); + TRANSFORM_VECTOR(svec, svector3f); } { LOAD_MATRIX_SCALAR(); - TRANSFORM_VECTOR_SCALAR(sv, svector3f); + TRANSFORM_VECTOR_SCALAR(svec, svector3f); } } if (tvector3f) { - const float * RESTRICT tv = model->surfmesh.data_tvector3f; + const float * RESTRICT tvec = model->surfmesh.data_tvector3f; const unsigned short * RESTRICT b = model->surfmesh.blends; - for (i = 0; i < num_vertices_minus_one; i++, tv += 3, b++, tvector3f += 3) + for (i = 0; i < num_vertices_minus_one; i++, tvec += 3, b++, tvector3f += 3) { LOAD_MATRIX3(); - TRANSFORM_VECTOR(tv, tvector3f); + TRANSFORM_VECTOR(tvec, tvector3f); } { LOAD_MATRIX_SCALAR(); - TRANSFORM_VECTOR_SCALAR(tv, tvector3f); + TRANSFORM_VECTOR_SCALAR(tvec, tvector3f); } }