- __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
- __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
- mod = _mm_packs_epi32(mod, mod);
- pix = _mm_mulhi_epu16(pix, mod);
- *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+ int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
+ __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
+ if (nextsub >= endx)
+ {
+ nextsub = endsub = endx-1;
+ if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
+ }
+ mod = endmod;
+ submod = endsubmod;
+ endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+ substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
+ endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
+ submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
+ substep = _mm_packs_epi32(substep, substep);
+ for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
+ {
+ __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
+ pix = _mm_mulhi_epu16(pix, submod);
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
+ }
+ if (x <= endsub)
+ {
+ __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
+ pix = _mm_mulhi_epu16(pix, submod);
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+ x++;
+ }