+#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) /* out = in[0]*m0 + in[1]*m1 + in[2]*m2 + in[3]*m3, computed on __m128 values */ \
+{ /* expands to a bare { } block (not do/while), so a ';' after the invocation is an extra empty statement */ \
+ __m128 p = (in); /* 'in' is evaluated exactly once; the local is named p, so arguments must not refer to a variable called p */ \
+ out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), /* lane 0 of p broadcast to all lanes, times m0 */ \
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), /* lane 1 of p broadcast, times m1 */ \
+ _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), /* lane 2 of p broadcast, times m2 */ \
+ _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); /* lane 3 of p broadcast, times m3; 'out' is assigned once and must be an lvalue */ \
+}
+
+static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
+{ /* Bounds the projection of a box along one output axis and reports which
+   * corners failed the clip test.
+   *
+   * Each input matrix group is lane-swapped (0 <-> 1) before use, so lane 0 of a
+   * transformed corner is component 1 of the plain transform; going by the
+   * function name this is presumably clip-space Y (confirm with callers).
+   * The clip distance of a corner is component 2 + component 3 (presumably
+   * z + w, i.e. a near-plane test - confirm); a corner passes when it is >= 0.
+   *
+   * Passing corners contribute component 1 / component 3 to a running min/max.
+   * For every failed corner, each of its three box edges that leads to a passing
+   * corner is cut where the clip distance reaches zero, and that point is
+   * projected and added to the min/max as well.
+   *
+   * Parameters:
+   *   starty, endy - outputs, written unconditionally at the end.
+   *   minposf, maxposf - four floats each, read with _mm_load_ps and therefore
+   *       required to be 16-byte aligned. Corner k takes lane 0 from maxposf
+   *       when bit 0 of k is set, lane 1 when bit 1 is set, and lanes 2 and 3
+   *       when bit 2 is set; otherwise the lane comes from minposf.
+   *   inmatrix16f - sixteen floats read with unaligned loads as four groups of
+   *       four; group i is the vector scaled by input lane i.
+   * Also reads dpsoftrast.fb_viewportcenter and dpsoftrast.fb_viewportscale with
+   * aligned loads.
+   *
+   * Returns the clip mask: bit k is set when corner k did not pass the clip
+   * test, so 0 means every corner passed and 0xFF means none did.
+   */
+ int clipmask = 0xFF; /* every corner starts as failed; BBFRONT clears bit k when corner k passes */
+ __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
+ __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f); /* only lane 0 of minproj/maxproj is meaningful; the range starts empty (min 2, max -2) */
+ __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
+ __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
+ m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1)); /* swap lanes 0 and 1 so transformed vectors come out as (c1, c0, c2, c3) */
+ m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
+ m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
+ m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
+ #define BBFRONT(k, pos) /* transform corner k; if it passes the clip test, project it and grow the range */ \
+ { \
+ DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
+ clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); /* lane 0 = bb[k] lane 2 + bb[k] lane 3 */ \
+ if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) /* clip distance >= 0 */ \
+ { \
+ __m128 proj; \
+ clipmask &= ~(1<<k); /* corner k passed */ \
+ proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); /* lane 0 = bb[k] lane 0 / bb[k] lane 3 */ \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ }
+ BBFRONT(0, minpos); /* corner 0: every lane from minpos */
+ BBFRONT(1, _mm_move_ss(minpos, maxpos)); /* corner 1: lane 0 from maxpos */
+ BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); /* corner 2: lane 1 from maxpos */
+ BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); /* corner 3: lanes 0 and 1 from maxpos */
+ BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); /* corner 4: lanes 2 and 3 from maxpos */
+ BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); /* corner 5: lanes 0, 2 and 3 from maxpos */
+ BBFRONT(6, _mm_move_ss(maxpos, minpos)); /* corner 6: lanes 1, 2 and 3 from maxpos */
+ BBFRONT(7, maxpos); /* corner 7: every lane from maxpos */
+ #define BBCLIP(k) /* for a failed corner k, cut each box edge that leads to a passing neighbour */ \
+ { \
+ if (clipmask&(1<<k)) \
+ { \
+ if (!(clipmask&(1<<(k^1)))) /* neighbour differing in lane 0 passed */ \
+ { \
+ __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); /* fraction along the edge where the clip distance reaches 0 */ \
+ __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); /* point on the edge at that fraction */ \
+ proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); /* lane 0 = lane 0 / lane 3 */ \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ if (!(clipmask&(1<<(k^2)))) /* neighbour differing in lane 1 passed */ \
+ { \
+ __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
+ __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
+ proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ if (!(clipmask&(1<<(k^4)))) /* neighbour differing in lanes 2 and 3 passed */ \
+ { \
+ __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
+ __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
+ proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
+ minproj = _mm_min_ss(minproj, proj); \
+ maxproj = _mm_max_ss(maxproj, proj); \
+ } \
+ } \
+ }
+ BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7); /* NOTE(review): BBFRONT and BBCLIP are not #undef'd in the visible code */
+ viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2)); /* bring element 2 into lane 0; presumably the Y entry - confirm against the dpsoftrast layout */
+ viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
+ minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f)); /* keep the range inside [-2, 2] */
+ maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
+ minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale)); /* lane 0 = center + proj * scale */
+ maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
+ *starty = _mm_cvttss_si32(maxproj); /* truncating conversion; start is taken from maxproj, presumably because the scale is negative (flipped axis) - confirm */
+ *endy = _mm_cvttss_si32(minproj)+1; /* end is taken from minproj, plus one */
+ return clipmask;
+}
+
+static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)