+ float *end = dst + size*4;
+ if (stride == sizeof(float[3]))
+ {
+ float *end4 = dst + (size&~3)*4;
+ if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
+ {
+ while (dst < end4)
+ {
+ __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
+ dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
+ dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dst += 16;
+ src += 4*sizeof(float[3]);
+ }
+ }
+ else
+ {
+ while (dst < end4)
+ {
+ __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
+ dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
+ dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
+ dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
+ _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
+ dst += 16;
+ src += 4*sizeof(float[3]);
+ }
+ }
+ }
+ if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
+ {
+ while (dst < end)
+ {
+ __m128 v = _mm_loadu_ps((const float *)src);
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
+ v = _mm_move_ss(v, _mm_set_ss(1.0f));
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
+ _mm_store_ps(dst, v);
+ dst += 4;
+ src += stride;
+ }
+ }
+ else
+ {
+ while (dst < end)
+ {
+ __m128 v = _mm_load_ps((const float *)src);
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
+ v = _mm_move_ss(v, _mm_set_ss(1.0f));
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
+ _mm_store_ps(dst, v);
+ dst += 4;
+ src += stride;
+ }
+ }
+}
+
+static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
+{
+ float *end = dst + size*4;
+ __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ if (stride == sizeof(float[2]))
+ {
+ float *end2 = dst + (size&~1)*4;
+ if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
+ {
+ while (dst < end2)
+ {
+ __m128 v = _mm_loadu_ps((const float *)src);
+ _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
+ _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
+ dst += 8;
+ src += 2*sizeof(float[2]);
+ }
+ }
+ else
+ {
+ while (dst < end2)
+ {
+ __m128 v = _mm_load_ps((const float *)src);
+ _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
+ _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
+ dst += 8;
+ src += 2*sizeof(float[2]);
+ }
+ }
+ }
+ while (dst < end)
+ {
+ _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
+ dst += 4;
+ src += stride;
+ }