+ { // Compute bonepose[i] (b) and boneposerelative[i] (r) for bone i with SSE, one 4-float row per register.
+ const float * RESTRICT n = model->data_baseboneposeinverse + i * 12; // 12 floats for bone i, read below as n[0..3], n[4..7], n[8..11]
+ matrix4x4_t * RESTRICT b = &bonepose[i]; // output: rows b0..b3 are stored here
+ matrix4x4_t * RESTRICT r = &boneposerelative[i]; // output: rows r0..r3 (b combined with n) are stored here
+ __m128 b0, b1, b2, b3, r0, r1, r2, r3, nr; // bK / rK hold row K of b / r; nr is scratch for four floats of n
+ if (model->data_bones[i].parent >= 0) // bone has a parent: combine the parent's rows with m
+ {
+ const matrix4x4_t * RESTRICT p = &bonepose[model->data_bones[i].parent]; // NOTE(review): assumes the parent's bonepose entry was already written, i.e. bones are ordered parent-first - confirm
+ __m128 pr = _mm_load_ps(p->m[0]); // parent row 0 (aligned load)
+ b0 = _mm_mul_ps(pr, _mm_set1_ps(m[0])); // bK = p.row0 * m[K]; m is declared outside this block, indices 0..11 are read here
+ b1 = _mm_mul_ps(pr, _mm_set1_ps(m[1]));
+ b2 = _mm_mul_ps(pr, _mm_set1_ps(m[2]));
+ b3 = _mm_mul_ps(pr, _mm_set1_ps(m[3]));
+ pr = _mm_load_ps(p->m[1]); // parent row 1
+ b0 = _mm_add_ps(b0, _mm_mul_ps(pr, _mm_set1_ps(m[4]))); // bK += p.row1 * m[4+K]
+ b1 = _mm_add_ps(b1, _mm_mul_ps(pr, _mm_set1_ps(m[5])));
+ b2 = _mm_add_ps(b2, _mm_mul_ps(pr, _mm_set1_ps(m[6])));
+ b3 = _mm_add_ps(b3, _mm_mul_ps(pr, _mm_set1_ps(m[7])));
+ pr = _mm_load_ps(p->m[2]); // parent row 2
+ b0 = _mm_add_ps(b0, _mm_mul_ps(pr, _mm_set1_ps(m[8]))); // bK += p.row2 * m[8+K]
+ b1 = _mm_add_ps(b1, _mm_mul_ps(pr, _mm_set1_ps(m[9])));
+ b2 = _mm_add_ps(b2, _mm_mul_ps(pr, _mm_set1_ps(m[10])));
+ b3 = _mm_add_ps(b3, _mm_mul_ps(pr, _mm_set1_ps(m[11])));
+ b3 = _mm_add_ps(b3, _mm_load_ps(p->m[3])); // only b3 additionally receives parent row 3, unscaled
+ }
+ else // parent < 0: b is built from m alone
+ {
+ b0 = _mm_setr_ps(m[0], m[4], m[8], 0.0f); // bK = (m[K], m[4+K], m[8+K], pad): the 12 floats of m transposed into four rows
+ b1 = _mm_setr_ps(m[1], m[5], m[9], 0.0f);
+ b2 = _mm_setr_ps(m[2], m[6], m[10], 0.0f);
+ b3 = _mm_setr_ps(m[3], m[7], m[11], 1.0f); // fourth lane is 0 for b0..b2 and 1 for b3
+ }
+ _mm_store_ps(b->m[0], b0); // write b to bonepose[i]; _mm_store_ps needs 16-byte-aligned destinations
+ _mm_store_ps(b->m[1], b1);
+ _mm_store_ps(b->m[2], b2);
+ _mm_store_ps(b->m[3], b3);
+ nr = _mm_loadu_ps(n); // n[0..3], unaligned load
+ r0 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0))); // rK = b0 * n[K]; the shuffle broadcasts lane K of nr to all four lanes
+ r1 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1)));
+ r2 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2)));
+ r3 = _mm_mul_ps(b0, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3)));
+ nr = _mm_loadu_ps(n+4); // n[4..7]
+ r0 = _mm_add_ps(r0, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); // rK += b1 * n[4+K]
+ r1 = _mm_add_ps(r1, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1))));
+ r2 = _mm_add_ps(r2, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2))));
+ r3 = _mm_add_ps(r3, _mm_mul_ps(b1, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3))));
+ nr = _mm_loadu_ps(n+8); // n[8..11]
+ r0 = _mm_add_ps(r0, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(0, 0, 0, 0)))); // rK += b2 * n[8+K]
+ r1 = _mm_add_ps(r1, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(1, 1, 1, 1))));
+ r2 = _mm_add_ps(r2, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(2, 2, 2, 2))));
+ r3 = _mm_add_ps(r3, _mm_mul_ps(b2, _mm_shuffle_ps(nr, nr, _MM_SHUFFLE(3, 3, 3, 3))));
+ r3 = _mm_add_ps(r3, b3); // only r3 additionally receives b3, unscaled (same pattern as the parent case above)
+ _mm_store_ps(r->m[0], r0); // write r to boneposerelative[i] (aligned stores)
+ _mm_store_ps(r->m[1], r1);
+ _mm_store_ps(r->m[2], r2);
+ _mm_store_ps(r->m[3], r3);
+ }