typedef qboolean bool;
#endif
-#define GL_NONE 0
-#define GL_FRONT_LEFT 0x0400
-#define GL_FRONT_RIGHT 0x0401
-#define GL_BACK_LEFT 0x0402
-#define GL_BACK_RIGHT 0x0403
-#define GL_FRONT 0x0404
-#define GL_BACK 0x0405
-#define GL_LEFT 0x0406
-#define GL_RIGHT 0x0407
-#define GL_FRONT_AND_BACK 0x0408
-#define GL_AUX0 0x0409
-#define GL_AUX1 0x040A
-#define GL_AUX2 0x040B
-#define GL_AUX3 0x040C
-
-#define GL_NEVER 0x0200
-#define GL_LESS 0x0201
-#define GL_EQUAL 0x0202
-#define GL_LEQUAL 0x0203
-#define GL_GREATER 0x0204
-#define GL_NOTEQUAL 0x0205
-#define GL_GEQUAL 0x0206
-#define GL_ALWAYS 0x0207
-
-#define GL_ZERO 0x0
-#define GL_ONE 0x1
-#define GL_SRC_COLOR 0x0300
-#define GL_ONE_MINUS_SRC_COLOR 0x0301
-#define GL_DST_COLOR 0x0306
-#define GL_ONE_MINUS_DST_COLOR 0x0307
-#define GL_SRC_ALPHA 0x0302
-#define GL_ONE_MINUS_SRC_ALPHA 0x0303
-#define GL_DST_ALPHA 0x0304
-#define GL_ONE_MINUS_DST_ALPHA 0x0305
-#define GL_SRC_ALPHA_SATURATE 0x0308
-#define GL_CONSTANT_COLOR 0x8001
-#define GL_ONE_MINUS_CONSTANT_COLOR 0x8002
-#define GL_CONSTANT_ALPHA 0x8003
-#define GL_ONE_MINUS_CONSTANT_ALPHA 0x8004
+#if defined(__GNUC__) // GCC-style compilers: the aligned attribute is written after the declaration
+#define ALIGN(var) var __attribute__((__aligned__(16)))
+#elif defined(_MSC_VER) // MSVC: the __declspec(align(16)) specifier is written before the declaration
+#define ALIGN(var) __declspec(align(16)) var
+#else // any other compiler: ALIGN(var) expands to var unchanged, so no 16-byte alignment is requested
+#define ALIGN(var) var
+#endif
+
+#ifdef SSE2_PRESENT
+#include <emmintrin.h>
+
+// 16-byte-aligned allocation; memory obtained this way must be released with MM_FREE
+#define MM_MALLOC(size) _mm_malloc(size, 16)
+
+// calloc-style counterpart of MM_MALLOC: returns nmemb*size zero-filled bytes
+// aligned to 16 bytes, or NULL if the allocation fails or nmemb*size does not
+// fit in size_t
+static void *MM_CALLOC(size_t nmemb, size_t size)
+{
+	size_t total;
+	void *ptr;
+	// refuse requests whose byte count would wrap around, as calloc does,
+	// instead of silently allocating a too-small buffer
+	if (size != 0 && nmemb > (size_t)-1 / size)
+		return NULL;
+	total = nmemb * size;
+	ptr = _mm_malloc(total, 16);
+	if (ptr != NULL)
+		memset(ptr, 0, total);
+	return ptr;
+}
+
+#define MM_FREE _mm_free
+#else
+// without SSE2_PRESENT the plain C allocator is used
+#define MM_MALLOC(size) malloc(size)
+#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
+#define MM_FREE free
+#endif
typedef enum DPSOFTRAST_ARRAY_e
{
#define DPSOFTRAST_MAXSUBSPAN 16
-typedef struct DPSOFTRAST_State_Draw_Span_s
+typedef ALIGN(struct DPSOFTRAST_State_Draw_Span_s
{
int start; // pixel index
int length; // pixel count
// [0][DPSOFTRAST_ARRAY_TOTAL][] is start screencoord4f
// [1][DPSOFTRAST_ARRAY_TOTAL][] is end screencoord4f
// NOTE: screencoord4f[3] is W (basically 1/Z), useful for depthbuffer
- float data[2][DPSOFTRAST_ARRAY_TOTAL+1][4];
+ ALIGN(float data[2][DPSOFTRAST_ARRAY_TOTAL+1][4]);
}
-DPSOFTRAST_State_Draw_Span;
+DPSOFTRAST_State_Draw_Span);
#define DPSOFTRAST_DRAW_MAXSPANQUEUE 1024
}
DPSOFTRAST_BLENDMODE;
-typedef struct DPSOFTRAST_State_s
+typedef ALIGN(struct DPSOFTRAST_State_s
{
// DPSOFTRAST_VALIDATE_ flags
int validate;
int shader_mode;
int shader_permutation;
- float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4];
+ ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// derived values (DPSOFTRAST_VALIDATE_FB)
int fb_clearscissor[4];
int fb_viewport[4];
int fb_viewportscissor[4];
- float fb_viewportcenter[2];
- float fb_viewportscale[2];
+ ALIGN(float fb_viewportcenter[4]);
+ ALIGN(float fb_viewportscale[4]);
// derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
int fb_depthfunc;
DPSOFTRAST_State_Draw draw;
}
-DPSOFTRAST_State;
+DPSOFTRAST_State);
DPSOFTRAST_State dpsoftrast;
extern int dpsoftrast_test;
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
+#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Converts four float channels to one packed integer: each channel is scaled by
// 255 and rounded, then placed as r<<16 | g<<8 | b | a<<24.
// Every argument is parenthesized so that expression arguments (e.g. x + y)
// are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to an integer: DPSOFTRAST_DEPTHSCALE * (1 - d), truncated to int.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
dpsoftrast.fb_viewportscissor[1] = y5;
dpsoftrast.fb_viewportscissor[2] = x6 - x5;
dpsoftrast.fb_viewportscissor[3] = y6 - y5;
- dpsoftrast.fb_viewportcenter[0] = dpsoftrast.user.viewport[0] + 0.5f * dpsoftrast.user.viewport[2] - 0.5f;
- dpsoftrast.fb_viewportcenter[1] = dpsoftrast.fb_height - dpsoftrast.user.viewport[1] - 0.5f * dpsoftrast.user.viewport[3] - 0.5f;
- dpsoftrast.fb_viewportscale[0] = 0.5f * dpsoftrast.user.viewport[2];
- dpsoftrast.fb_viewportscale[1] = -0.5f * dpsoftrast.user.viewport[3];
+ dpsoftrast.fb_viewportcenter[1] = dpsoftrast.user.viewport[0] + 0.5f * dpsoftrast.user.viewport[2] - 0.5f;
+ dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.user.viewport[1] - 0.5f * dpsoftrast.user.viewport[3] - 0.5f;
+ dpsoftrast.fb_viewportcenter[3] = 0.5f;
+ dpsoftrast.fb_viewportcenter[0] = 0.0f;
+ dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.user.viewport[2];
+ dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.user.viewport[3];
+ dpsoftrast.fb_viewportscale[3] = 0.5f;
+ dpsoftrast.fb_viewportscale[0] = 1.0f;
}
void DPSOFTRAST_RecalcDepthFunc(void)
texture->size = size;
// allocate the pixels now
- texture->bytes = (unsigned char *)calloc(1, size);
+ texture->bytes = (unsigned char *)MM_CALLOC(1, size);
return texnum;
}
DPSOFTRAST_Texture *texture;
texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
if (texture->bytes)
- free(texture->bytes);
+ MM_FREE(texture->bytes);
texture->bytes = NULL;
memset(texture, 0, sizeof(*texture));
// adjust the free range and used range
while (dpsoftrast.draw.maxvertices < dpsoftrast.draw.numvertices)
dpsoftrast.draw.maxvertices *= 2;
if (dpsoftrast.draw.in_array4f[0])
- free(dpsoftrast.draw.in_array4f[0]);
- data = (float *)calloc(1, dpsoftrast.draw.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
+ MM_FREE(dpsoftrast.draw.in_array4f[0]);
+ data = (float *)MM_CALLOC(1, dpsoftrast.draw.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.draw.maxvertices * 4)
dpsoftrast.draw.in_array4f[i] = data;
for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.draw.maxvertices * 4)
memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
+#ifdef SSE2_PRESENT
+static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v) // projects one 4-float vertex held in a register; only compiled when SSE2_PRESENT is defined
+{
+ __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale); // aligned loads of the per-lane offset and scale
+ __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)); // lane 3 of the input copied into all four lanes
+ v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set1_ps(1.0f)); // rotate lanes to (v3, v0, v1, v2), then replace lane 0 with 1.0f
+ v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w)); // per lane: center + (scale * value) / w; w is not tested for zero
+ v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)); // rotate back: lanes 1..3 move to 0..2 and lane 0 (derived from 1.0f / w) ends up in lane 3
+ return v;
+}
+#endif
+
// Projects numitems 4-float vertices from in4f to out4f using
// dpsoftrast.fb_viewportcenter and dpsoftrast.fb_viewportscale.
// in4f and out4f are accessed with aligned SSE load/store, so both are
// assumed to be 16-byte aligned.
void DPSOFTRAST_Draw_ProjectVertices(float *out4f, const float *in4f, int numitems)
{
+#ifdef SSE2_PRESENT
 // NOTE: this is used both as a whole mesh transform function and a
 // per-triangle transform function (for clipped triangles), accordingly
 // it should not crash on divide by 0 but the result of divide by 0 is
 // unimportant...
 // NOTE(review): the entire body is inside #ifdef SSE2_PRESENT, so without it this function writes nothing to out4f
 int i;
- float w;
- float viewportcenter[4];
- float viewportscale[4];
- viewportscale[0] = dpsoftrast.fb_viewportscale[0];
- viewportscale[1] = dpsoftrast.fb_viewportscale[1];
- viewportscale[2] = 0.5f;
- viewportscale[3] = 0.0f;
- viewportcenter[0] = dpsoftrast.fb_viewportcenter[0];
- viewportcenter[1] = dpsoftrast.fb_viewportcenter[1];
- viewportcenter[2] = 0.5f;
- viewportcenter[3] = 0.0f;
+ __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale); // loaded once, reused for every vertex
 for (i = 0;i < numitems;i++)
 {
- if (!in4f[3])
- {
- out4f[0] = 0.0f;
- out4f[1] = 0.0f;
- out4f[2] = 0.0f;
- out4f[3] = 0.0f;
- continue;
- }
- w = 1.0f / in4f[3];
- out4f[0] = viewportcenter[0] + viewportscale[0] * in4f[0] * w;
- out4f[1] = viewportcenter[1] + viewportscale[1] * in4f[1] * w;
- out4f[2] = viewportcenter[2] + viewportscale[2] * in4f[2] * w;
- out4f[3] = viewportcenter[3] + viewportscale[3] * in4f[3] * w;
- out4f[3] = w;
+ __m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)); // w = lane 3 of the vertex copied into all four lanes
+ v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set1_ps(1.0f)); // rotate lanes to (v3, v0, v1, v2), then replace lane 0 with 1.0f
+ v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w)); // per lane: center + (scale * value) / w; unlike the removed scalar code, w == 0 is not special-cased
+ _mm_store_ps(out4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1))); // rotate back so the lane derived from 1.0f / w is stored last
 in4f += 4;
 out4f += 4;
 }
+#endif
}
void DPSOFTRAST_Draw_DebugEdgePoints(const float *screen0, const float *screen1)
int endx = span->endx;
float w = span->data[0][DPSOFTRAST_ARRAY_TOTAL][3];
float wslope = span->data[1][DPSOFTRAST_ARRAY_TOTAL][3];
+ float endz = 1.0f / (w + wslope * startx);
for (x = startx;x < endx;)
{
- int endsub = x + DPSOFTRAST_MAXSUBSPAN-1;
- float z = 1.0f / (w + wslope * x), dz;
- if (endsub >= endx)
- {
- endsub = endx-1;
- dz = endsub > x ? (1.0f / (w + wslope * endsub) - z) / (endsub - x) : 0.0f;
- }
- else
- {
- dz = (1.0f / (w + wslope * endsub) - z) * (1.0f / (DPSOFTRAST_MAXSUBSPAN-1));
- }
+ int nextsub = x + DPSOFTRAST_MAXSUBSPAN, endsub = nextsub - 1;
+ float z = endz, dz;
+ if(nextsub >= endx) nextsub = endsub = endx-1;
+ endz = 1.0f / (w + wslope * nextsub);
+ dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
for (; x <= endsub; x++, z += dz)
zf[x] = z;
}
void DPSOFTRAST_Draw_Span_FinishBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
{
+#ifdef SSE2_PRESENT
int x;
int startx = span->startx;
int endx = span->endx;
- int d[4];
const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
- int a, b;
unsigned char * RESTRICT pixelmask = span->pixelmask;
unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
switch(dpsoftrast.fb_blendmode)
{
case DPSOFTRAST_BLENDMODE_OPAQUE:
- for (x = startx;x < endx;x++)
+ for (x = startx;x + 4 <= endx;)
+ {
+ if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
+ {
+ _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+ x += 4;
+ }
+ else
+ {
+ if (pixelmask[x])
+ pixeli[x] = ini[x];
+ x++;
+ }
+ }
+ for (;x < endx;x++)
if (pixelmask[x])
pixeli[x] = ini[x];
break;
case DPSOFTRAST_BLENDMODE_ALPHA:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- a = in4ub[x*4+3];
- b = 256 - in4ub[x*4+3];
- pixel[x*4+0] = (in4ub[x*4+0]*a+pixel[x*4+0]*b) >> 8;
- pixel[x*4+1] = (in4ub[x*4+1]*a+pixel[x*4+1]*b) >> 8;
- pixel[x*4+2] = (in4ub[x*4+2]*a+pixel[x*4+2]*b) >> 8;
- pixel[x*4+3] = (in4ub[x*4+3]*a+pixel[x*4+3]*b) >> 8;
- }
+ #define FINISHBLEND(blend2, blend1) \
+ for (x = startx;x + 2 <= endx;x += 2) \
+ { \
+ __m128i src, dst; \
+ switch (*(const unsigned short*)&pixelmask[x]) \
+ { \
+ case 0x0101: \
+ src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
+ dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
+ blend2; \
+ _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
+ continue; \
+ case 0x0100: \
+ src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
+ dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
+ blend1; \
+ pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
+ continue; \
+ case 0x0001: \
+ src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
+ dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
+ blend1; \
+ pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
+ continue; \
+ } \
+ break; \
+ } \
+ for(;x < endx; x++) \
+ { \
+ __m128i src, dst; \
+ if (!pixelmask[x]) \
+ continue; \
+ src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
+ dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
+ blend1; \
+ pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
+ }
+
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+ });
break;
case DPSOFTRAST_BLENDMODE_ADDALPHA:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- a = in4ub[x*4+3];
- d[0] = (((in4ub[x*4+0]*a)>>8)+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
- d[1] = (((in4ub[x*4+1]*a)>>8)+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
- d[2] = (((in4ub[x*4+2]*a)>>8)+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
- d[3] = (((in4ub[x*4+3]*a)>>8)+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
- pixel[x*4+0] = d[0];
- pixel[x*4+1] = d[1];
- pixel[x*4+2] = d[2];
- pixel[x*4+3] = d[3];
- }
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ });
break;
case DPSOFTRAST_BLENDMODE_ADD:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- d[0] = (in4ub[x*4+0]+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
- d[1] = (in4ub[x*4+1]+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
- d[2] = (in4ub[x*4+2]+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
- d[3] = (in4ub[x*4+3]+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
- pixel[x*4+0] = d[0];
- pixel[x*4+1] = d[1];
- pixel[x*4+2] = d[2];
- pixel[x*4+3] = d[3];
- }
+ FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
break;
case DPSOFTRAST_BLENDMODE_INVMOD:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- pixel[x*4+0] = ((255-in4ub[x*4+0])*pixel[x*4+0])>>8;
- pixel[x*4+1] = ((255-in4ub[x*4+1])*pixel[x*4+1])>>8;
- pixel[x*4+2] = ((255-in4ub[x*4+2])*pixel[x*4+2])>>8;
- pixel[x*4+3] = ((255-in4ub[x*4+3])*pixel[x*4+3])>>8;
- }
+ FINISHBLEND({
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+ }, {
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+ });
break;
case DPSOFTRAST_BLENDMODE_MUL:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- pixel[x*4+0] = (in4ub[x*4+0]*pixel[x*4+0])>>8;
- pixel[x*4+1] = (in4ub[x*4+1]*pixel[x*4+1])>>8;
- pixel[x*4+2] = (in4ub[x*4+2]*pixel[x*4+2])>>8;
- pixel[x*4+3] = (in4ub[x*4+3]*pixel[x*4+3])>>8;
- }
+ FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
break;
case DPSOFTRAST_BLENDMODE_MUL2:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- d[0] = (in4ub[x*4+0]*pixel[x*4+0])>>7;if (d[0] > 255) d[0] = 255;
- d[1] = (in4ub[x*4+1]*pixel[x*4+1])>>7;if (d[1] > 255) d[1] = 255;
- d[2] = (in4ub[x*4+2]*pixel[x*4+2])>>7;if (d[2] > 255) d[2] = 255;
- d[3] = (in4ub[x*4+3]*pixel[x*4+3])>>7;if (d[3] > 255) d[3] = 255;
- pixel[x*4+0] = d[0];
- pixel[x*4+1] = d[1];
- pixel[x*4+2] = d[2];
- pixel[x*4+3] = d[3];
- }
+ FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
break;
case DPSOFTRAST_BLENDMODE_SUBALPHA:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- a = in4ub[x*4+3];
- d[0] = pixel[x*4+0]-((in4ub[x*4+0]*a)>>8);if (d[0] < 0) d[0] = 0;
- d[1] = pixel[x*4+1]-((in4ub[x*4+1]*a)>>8);if (d[1] < 0) d[1] = 0;
- d[2] = pixel[x*4+2]-((in4ub[x*4+2]*a)>>8);if (d[2] < 0) d[2] = 0;
- d[3] = pixel[x*4+3]-((in4ub[x*4+3]*a)>>8);if (d[3] < 0) d[3] = 0;
- pixel[x*4+0] = d[0];
- pixel[x*4+1] = d[1];
- pixel[x*4+2] = d[2];
- pixel[x*4+3] = d[3];
- }
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+ });
break;
case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
- for (x = startx;x < endx;x++)
- {
- if (!pixelmask[x])
- continue;
- b = 255 - in4ub[x*4+3];
- d[0] = in4ub[x*4+0]+((pixel[x*4+0]*b)>>8);if (d[0] > 255) d[0] = 255;
- d[1] = in4ub[x*4+1]+((pixel[x*4+1]*b)>>8);if (d[1] > 255) d[1] = 255;
- d[2] = in4ub[x*4+2]+((pixel[x*4+2]*b)>>8);if (d[2] > 255) d[2] = 255;
- d[3] = in4ub[x*4+3]+((pixel[x*4+3]*b)>>8);if (d[3] > 255) d[3] = 255;
- pixel[x*4+0] = d[0];
- pixel[x*4+1] = d[1];
- pixel[x*4+2] = d[2];
- pixel[x*4+3] = d[3];
- }
+ FINISHBLEND({
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+ }, {
+ __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+ dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+ });
break;
}
+#endif
}
void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
float c[4];
float data[4];
float slope[4];
- float tc[2];
+ float tc[2], endtc[2];
float tcscale[2];
unsigned int tci[2];
unsigned int tci1[2];
return;
}
mip = span->mip[texunitindex];
+ pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
// if this mipmap of the texture is 1 pixel, just fill it with that color
if (texture->mipmap[mip][1] == 4)
{
slope[2] = span->data[1][arrayindex][2];
slope[3] = span->data[1][arrayindex][3];
flags = texture->flags;
- pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
tcscale[0] = texture->mipmap[mip][2];
tcscale[1] = texture->mipmap[mip][3];
tciwidth = texture->mipmap[mip][2];
tcimax[1] = texture->mipmap[mip][3]-1;
tciwrapmask[0] = texture->mipmap[mip][2]-1;
tciwrapmask[1] = texture->mipmap[mip][3]-1;
+ endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
+ endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
for (x = startx;x < endx;)
{
- float endtc[2];
unsigned int subtc[2];
unsigned int substep[2];
- int endsub = x + DPSOFTRAST_MAXSUBSPAN-1;
- float subscale = 4096.0f/(DPSOFTRAST_MAXSUBSPAN-1);
- if (endsub >= endx)
+ float subscale = 65536.0f/DPSOFTRAST_MAXSUBSPAN;
+ int nextsub = x + DPSOFTRAST_MAXSUBSPAN, endsub = nextsub - 1;
+ if(nextsub >= endx)
{
- endsub = endx-1;
- subscale = endsub > x ? 4096.0f / (endsub - x) : 1.0f;
+ nextsub = endsub = endx-1;
+ if(x < nextsub) subscale = 65536.0f / (nextsub - x);
}
- tc[0] = (data[0] + slope[0]*x) * zf[x] * tcscale[0] - 0.5f;
- tc[1] = (data[1] + slope[1]*x) * zf[x] * tcscale[1] - 0.5f;
- endtc[0] = (data[0] + slope[0]*endsub) * zf[endsub] * tcscale[0] - 0.5f;
- endtc[1] = (data[1] + slope[1]*endsub) * zf[endsub] * tcscale[1] - 0.5f;
+ tc[0] = endtc[0];
+ tc[1] = endtc[1];
+ endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
+ endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
substep[0] = (endtc[0] - tc[0]) * subscale;
substep[1] = (endtc[1] - tc[1]) * subscale;
- subtc[0] = tc[0] * (1<<12);
- subtc[1] = tc[1] * (1<<12);
- if (!(flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE))
- {
- subtc[0] &= (tciwrapmask[0]<<12)|0xFFF;
- subtc[1] &= (tciwrapmask[1]<<12)|0xFFF;
- }
+ subtc[0] = tc[0] * (1<<16);
+ subtc[1] = tc[1] * (1<<16);
if(filter)
{
- tci[0] = (subtc[0]>>12) - tcimin[0];
- tci[1] = (subtc[1]>>12) - tcimin[1];
- tci1[0] = ((subtc[0] + (endsub - x)*substep[0])>>12);
- tci1[1] = ((subtc[1] + (endsub - x)*substep[1])>>12);
- if (tci[0] <= tcimax[0]-1 && tci[1] <= tcimax[1]-1 && tci1[0] <= tcimax[0]-1 && tci1[1] <= tcimax[1]-1)
- {
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
- {
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
- pixel[1] = pixel[0] + 4 * tciwidth;
- c[0] = (pixel[0][2]*lerp[0]+pixel[0][4+2]*lerp[1]+pixel[1][2]*lerp[2]+pixel[1][4+2]*lerp[3]) * (1.0f / 0xFF000000);
- c[1] = (pixel[0][1]*lerp[0]+pixel[0][4+1]*lerp[1]+pixel[1][1]*lerp[2]+pixel[1][4+1]*lerp[3]) * (1.0f / 0xFF000000);
- c[2] = (pixel[0][0]*lerp[0]+pixel[0][4+0]*lerp[1]+pixel[1][0]*lerp[2]+pixel[1][4+0]*lerp[3]) * (1.0f / 0xFF000000);
- c[3] = (pixel[0][3]*lerp[0]+pixel[0][4+3]*lerp[1]+pixel[1][3]*lerp[2]+pixel[1][4+3]*lerp[3]) * (1.0f / 0xFF000000);
- out4f[x*4+0] = c[0];
- out4f[x*4+1] = c[1];
- out4f[x*4+2] = c[2];
- out4f[x*4+3] = c[3];
- }
- }
- else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+ if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
{
for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
{
unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
+ tci[0] = subtc[0]>>16;
+ tci[1] = subtc[1]>>16;
tci1[0] = tci[0] + 1;
tci1[1] = tci[1] + 1;
tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
+ tci[0] = subtc[0]>>16;
+ tci[1] = subtc[1]>>16;
tci1[0] = tci[0] + 1;
tci1[1] = tci[1] + 1;
tci[0] &= tciwrapmask[0];
{
for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
{
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
+ tci[0] = subtc[0]>>16;
+ tci[1] = subtc[1]>>16;
tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
{
for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
{
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
+ tci[0] = subtc[0]>>16;
+ tci[1] = subtc[1]>>16;
tci[0] &= tciwrapmask[0];
tci[1] &= tciwrapmask[1];
pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
{
+#ifdef SSE2_PRESENT
int x;
int startx = span->startx;
int endx = span->endx;
int flags;
- float data[4];
- float slope[4];
- float tc[2];
- float tcscale[2];
- unsigned int tci[2];
- unsigned int tci1[2];
- unsigned int tcimin[2];
- unsigned int tcimax[2];
- int tciwrapmask[2];
- int tciwidth;
+ __m128 data, slope, tcscale;
+ __m128i tcsize, tcmask, tcoffset, tcmax;
+ __m128 tc, endtc;
+ __m128i subtc, substep, endsubtc;
int filter;
int mip;
- unsigned int k;
unsigned int *outi = (unsigned int *)out4ub;
const unsigned char * RESTRICT pixelbase;
- const unsigned int * RESTRICT pixelbasei;
- const unsigned char * RESTRICT pixel[4];
DPSOFTRAST_Texture *texture = dpsoftrast.texbound[texunitindex];
// if no texture is bound, just fill it with white
if (!texture)
{
- memset(out4ub, 255, span->length*4);
+ memset(out4ub + startx*4, 255, span->length*4);
return;
}
mip = span->mip[texunitindex];
+ pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
// if this mipmap of the texture is 1 pixel, just fill it with that color
if (texture->mipmap[mip][1] == 4)
{
- k = *((const unsigned int *)texture->bytes);
+ unsigned int k = *((const unsigned int *)pixelbase);
for (x = startx;x < endx;x++)
outi[x] = k;
return;
}
filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
- data[0] = span->data[0][arrayindex][0];
- data[1] = span->data[0][arrayindex][1];
- data[2] = span->data[0][arrayindex][2];
- data[3] = span->data[0][arrayindex][3];
- slope[0] = span->data[1][arrayindex][0];
- slope[1] = span->data[1][arrayindex][1];
- slope[2] = span->data[1][arrayindex][2];
- slope[3] = span->data[1][arrayindex][3];
+ data = _mm_load_ps(span->data[0][arrayindex]);
+ slope = _mm_load_ps(span->data[1][arrayindex]);
flags = texture->flags;
- pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
- pixelbasei = (const unsigned int *)pixelbase;
- tcscale[0] = texture->mipmap[mip][2];
- tcscale[1] = texture->mipmap[mip][3];
- tciwidth = texture->mipmap[mip][2];
- tcimin[0] = 0;
- tcimin[1] = 0;
- tcimax[0] = texture->mipmap[mip][2]-1;
- tcimax[1] = texture->mipmap[mip][3]-1;
- tciwrapmask[0] = texture->mipmap[mip][2]-1;
- tciwrapmask[1] = texture->mipmap[mip][3]-1;
+ tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
+ tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
+ tcscale = _mm_cvtepi32_ps(tcsize);
+ data = _mm_mul_ps(_mm_shuffle_ps(data, data, _MM_SHUFFLE(1, 0, 1, 0)), tcscale);
+ slope = _mm_mul_ps(_mm_shuffle_ps(slope, slope, _MM_SHUFFLE(1, 0, 1, 0)), tcscale);
+ endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
+ endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
+ tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
+ tcmax = filter ? _mm_packs_epi32(tcmask, tcmask) : _mm_slli_epi32(tcmask, 16);
for (x = startx;x < endx;)
{
- float endtc[2];
- unsigned int subtc[2];
- unsigned int substep[2];
- int endsub = x + DPSOFTRAST_MAXSUBSPAN-1;
- float subscale = 4096.0f/(DPSOFTRAST_MAXSUBSPAN-1);
- if (endsub >= endx)
- {
- endsub = endx-1;
- subscale = endsub > x ? 4096.0f / (endsub - x) : 1.0f;
- }
- tc[0] = (data[0] + slope[0]*x) * zf[x] * tcscale[0] - 0.5f;
- tc[1] = (data[1] + slope[1]*x) * zf[x] * tcscale[1] - 0.5f;
- endtc[0] = (data[0] + slope[0]*endsub) * zf[endsub] * tcscale[0] - 0.5f;
- endtc[1] = (data[1] + slope[1]*endsub) * zf[endsub] * tcscale[1] - 0.5f;
- substep[0] = (endtc[0] - tc[0]) * subscale;
- substep[1] = (endtc[1] - tc[1]) * subscale;
- subtc[0] = tc[0] * (1<<12);
- subtc[1] = tc[1] * (1<<12);
- if (!(flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE))
- {
- subtc[0] &= (tciwrapmask[0]<<12)|0xFFF;
- subtc[1] &= (tciwrapmask[1]<<12)|0xFFF;
- }
-#if 0
-// LordHavoc: an attempt at reducing number of integer multiplies, did not show any improvement in benchmarks, abandoned.
- if (filter && dpsoftrast_test)
+ int nextsub = x + DPSOFTRAST_MAXSUBSPAN, endsub = nextsub - 1;
+ __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_MAXSUBSPAN);
+ if(nextsub >= endx)
+ {
+ nextsub = endsub = endx-1;
+ if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
+ }
+ tc = endtc;
+ subtc = endsubtc;
+ endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
+ substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
+ endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
+ subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
+ substep = _mm_slli_epi32(substep, 1);
+ if (filter)
{
- const unsigned int * RESTRICT pixeli[4];
- tci[0] = (subtc[0]>>12) - tcimin[0];
- tci[1] = (subtc[1]>>12) - tcimin[1];
- tci1[0] = ((subtc[0] + (endsub - x)*substep[0])>>12);
- tci1[1] = ((subtc[1] + (endsub - x)*substep[1])>>12);
- if (tci[0] <= tcimax[0]-1 && tci[1] <= tcimax[1]-1 && tci1[0] <= tcimax[0]-1 && tci1[1] <= tcimax[1]-1)
+ __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
+ if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
{
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
{
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { (ifrac[0]*ifrac[1]) >> 16, (frac[0]*ifrac[1]) >> 16, (ifrac[0]*frac[1]) >> 16, (frac[0]*frac[1]) >> 16 };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- pixeli[0] = pixelbasei + (tci[1]*tciwidth+tci[0]);
- pixeli[1] = pixeli[0] + tciwidth;
- outi[x] = ((((pixeli[0][0] >> 8) & 0x00FF00FF) * lerp[0] + ((pixeli[0][1] >> 8) & 0x00FF00FF) * lerp[1] + ((pixeli[1][0] >> 8) & 0x00FF00FF) * lerp[2] + ((pixeli[1][1] >> 8) & 0x00FF00FF) * lerp[3]) & 0xFF00FF00)
- | ((((pixeli[0][0] & 0x00FF00FF) * lerp[0] + ( pixeli[0][1] & 0x00FF00FF) * lerp[1] + ( pixeli[1][0] & 0x00FF00FF) * lerp[2] + ( pixeli[1][1] & 0x00FF00FF) * lerp[3])>>8) & 0x00FF00FF);
+ __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
+ tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
+ pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
+ pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix3 = _mm_add_epi16(pix3,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
+ _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
+ pix2 = _mm_unpacklo_epi64(pix1, pix3);
+ pix4 = _mm_unpackhi_epi64(pix1, pix3);
+ pix2 = _mm_add_epi16(pix2,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
+ _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
+ _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
}
- }
- else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
- {
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ if (x <= endsub)
{
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { (ifrac[0]*ifrac[1]) >> 16, (frac[0]*ifrac[1]) >> 16, (ifrac[0]*frac[1]) >> 16, (frac[0]*frac[1]) >> 16 };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- tci1[0] = tci[0] + 1;
- tci1[1] = tci[1] + 1;
- tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
- tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
- tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
- tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
- pixeli[0] = pixelbasei + (tci[1]*tciwidth+tci[0]);
- pixeli[1] = pixelbasei + (tci[1]*tciwidth+tci1[0]);
- pixeli[2] = pixelbasei + (tci1[1]*tciwidth+tci[0]);
- pixeli[3] = pixelbasei + (tci1[1]*tciwidth+tci1[0]);
- outi[x] = ((((pixeli[0][0] >> 8) & 0x00FF00FF) * lerp[0] + ((pixeli[1][0] >> 8) & 0x00FF00FF) * lerp[1] + ((pixeli[2][0] >> 8) & 0x00FF00FF) * lerp[2] + ((pixeli[3][0] >> 8) & 0x00FF00FF) * lerp[3]) & 0xFF00FF00)
- | ((((pixeli[0][0] & 0x00FF00FF) * lerp[0] + ( pixeli[1][0] & 0x00FF00FF) * lerp[1] + ( pixeli[2][0] & 0x00FF00FF) * lerp[2] + ( pixeli[3][0] & 0x00FF00FF) * lerp[3])>>8) & 0x00FF00FF);
+ __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
+ tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
+ outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ x++;
}
}
- else
+ else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
{
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
{
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { (ifrac[0]*ifrac[1]) >> 16, (frac[0]*ifrac[1]) >> 16, (ifrac[0]*frac[1]) >> 16, (frac[0]*frac[1]) >> 16 };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- tci1[0] = tci[0] + 1;
- tci1[1] = tci[1] + 1;
- tci[0] &= tciwrapmask[0];
- tci[1] &= tciwrapmask[1];
- tci1[0] &= tciwrapmask[0];
- tci1[1] &= tciwrapmask[1];
- pixeli[0] = pixelbasei + (tci[1]*tciwidth+tci[0]);
- pixeli[1] = pixelbasei + (tci[1]*tciwidth+tci1[0]);
- pixeli[2] = pixelbasei + (tci1[1]*tciwidth+tci[0]);
- pixeli[3] = pixelbasei + (tci1[1]*tciwidth+tci1[0]);
- outi[x] = ((((pixeli[0][0] >> 8) & 0x00FF00FF) * lerp[0] + ((pixeli[1][0] >> 8) & 0x00FF00FF) * lerp[1] + ((pixeli[2][0] >> 8) & 0x00FF00FF) * lerp[2] + ((pixeli[3][0] >> 8) & 0x00FF00FF) * lerp[3]) & 0xFF00FF00)
- | ((((pixeli[0][0] & 0x00FF00FF) * lerp[0] + ( pixeli[1][0] & 0x00FF00FF) * lerp[1] + ( pixeli[2][0] & 0x00FF00FF) * lerp[2] + ( pixeli[3][0] & 0x00FF00FF) * lerp[3])>>8) & 0x00FF00FF);
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
+ tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix3 = _mm_add_epi16(pix3,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
+ _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
+ pix2 = _mm_unpacklo_epi64(pix1, pix3);
+ pix4 = _mm_unpackhi_epi64(pix1, pix3);
+ pix2 = _mm_add_epi16(pix2,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
+ _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
+ _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
}
- }
- }
- else
-#endif
- if (filter)
- {
- tci[0] = (subtc[0]>>12) - tcimin[0];
- tci[1] = (subtc[1]>>12) - tcimin[1];
- tci1[0] = ((subtc[0] + (endsub - x)*substep[0])>>12);
- tci1[1] = ((subtc[1] + (endsub - x)*substep[1])>>12);
- if (tci[0] <= tcimax[0]-1 && tci[1] <= tcimax[1]-1 && tci1[0] <= tcimax[0]-1 && tci1[1] <= tcimax[1]-1)
- {
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ if (x <= endsub)
{
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
- pixel[1] = pixel[0] + 4 * tciwidth;
- out4ub[x*4+0] = (pixel[0][0]*lerp[0]+pixel[0][4+0]*lerp[1]+pixel[1][0]*lerp[2]+pixel[1][4+0]*lerp[3]) >> 24;
- out4ub[x*4+1] = (pixel[0][1]*lerp[0]+pixel[0][4+1]*lerp[1]+pixel[1][1]*lerp[2]+pixel[1][4+1]*lerp[3]) >> 24;
- out4ub[x*4+2] = (pixel[0][2]*lerp[0]+pixel[0][4+2]*lerp[1]+pixel[1][2]*lerp[2]+pixel[1][4+2]*lerp[3]) >> 24;
- out4ub[x*4+3] = (pixel[0][3]*lerp[0]+pixel[0][4+3]*lerp[1]+pixel[1][3]*lerp[2]+pixel[1][4+3]*lerp[3]) >> 24;
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
+ tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
+ outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ x++;
}
}
- else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+ else
{
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
{
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- tci1[0] = tci[0] + 1;
- tci1[1] = tci[1] + 1;
- tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
- tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
- tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
- tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
- pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
- pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
- pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
- pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
- out4ub[x*4+0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) >> 24;
- out4ub[x*4+1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) >> 24;
- out4ub[x*4+2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) >> 24;
- out4ub[x*4+3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) >> 24;
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix3 = _mm_add_epi16(pix3,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
+ _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
+ pix2 = _mm_unpacklo_epi64(pix1, pix3);
+ pix4 = _mm_unpackhi_epi64(pix1, pix3);
+ pix2 = _mm_add_epi16(pix2,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
+ _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
+ _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
}
- }
- else
- {
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ if (x <= endsub)
{
- unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
- unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
- unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- tci1[0] = tci[0] + 1;
- tci1[1] = tci[1] + 1;
- tci[0] &= tciwrapmask[0];
- tci[1] &= tciwrapmask[1];
- tci1[0] &= tciwrapmask[0];
- tci1[1] &= tciwrapmask[1];
- pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
- pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
- pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
- pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
- out4ub[x*4+0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) >> 24;
- out4ub[x*4+1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) >> 24;
- out4ub[x*4+2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) >> 24;
- out4ub[x*4+3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) >> 24;
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
+ outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ x++;
}
}
}
else
{
- tci[0] = (subtc[0]>>12) - tcimin[0];
- tci[1] = (subtc[1]>>12) - tcimin[1];
- tci1[0] = ((subtc[0] + (endsub - x)*substep[0])>>12);
- tci1[1] = ((subtc[1] + (endsub - x)*substep[1])>>12);
- if (tci[0] <= tcimax[0]-1 && tci[1] <= tcimax[1]-1 && tci1[0] <= tcimax[0]-1 && tci1[1] <= tcimax[1]-1)
+ if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
{
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
{
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- outi[x] = pixelbasei[(tci[1]*tciwidth+tci[0])];
+ __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
+ tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
}
- }
- else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
- {
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ if (x <= endsub)
{
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
- tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
- outi[x] = pixelbasei[(tci[1]*tciwidth+tci[0])];
+ __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
+ tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ x++;
}
}
else
{
- for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
{
- tci[0] = subtc[0]>>12;
- tci[1] = subtc[1]>>12;
- tci[0] &= tciwrapmask[0];
- tci[1] &= tciwrapmask[1];
- outi[x] = pixelbasei[(tci[1]*tciwidth+tci[0])];
+ __m128i tci = _mm_and_si128(subtc, tcmax);
+ tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
+ }
+ if (x <= endsub)
+ {
+ __m128i tci = _mm_and_si128(subtc, tcmax);
+ tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ x++;
}
}
}
}
+#endif
}
void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// For each pixel x in [span->startx, span->endx): out4ub[x] = in4ub[x] * (data + slope*x) * zf[x],
// 4 bytes per pixel, where data/slope come from span->data[0][arrayindex] / span->data[1][arrayindex]
// with components 0 and 2 swapped (same reordering the removed scalar code performed).
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
{
+#ifdef SSE2_PRESENT
int x;
int startx = span->startx;
int endx = span->endx;
- float data[4];
- float slope[4];
- float z;
- data[2] = span->data[0][arrayindex][0];
- data[1] = span->data[0][arrayindex][1];
- data[0] = span->data[0][arrayindex][2];
- data[3] = span->data[0][arrayindex][3];
- slope[2] = span->data[1][arrayindex][0];
- slope[1] = span->data[1][arrayindex][1];
- slope[0] = span->data[1][arrayindex][2];
- slope[3] = span->data[1][arrayindex][3];
- for (x = startx;x < endx;x++)
- {
- z = zf[x];
- out4ub[x*4+0] = (int)(in4ub[x*4+0] * (data[0] + slope[0]*x) * z);
- out4ub[x*4+1] = (int)(in4ub[x*4+1] * (data[1] + slope[1]*x) * z);
- out4ub[x*4+2] = (int)(in4ub[x*4+2] * (data[2] + slope[2]*x) * z);
- out4ub[x*4+3] = (int)(in4ub[x*4+3] * (data[3] + slope[3]*x) * z);
+ __m128 data = _mm_load_ps(span->data[0][arrayindex]), slope = _mm_load_ps(span->data[1][arrayindex]); // _mm_load_ps needs 16-byte aligned addresses
+ data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2)); // swap components 0 and 2, keep 1 and 3
+ slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
+ data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))); // value at the first pixel; afterwards advanced incrementally by slope
+ data = _mm_mul_ps(data, _mm_set1_ps(256.0f)); // 8.8 fixed-point scale so that mulhi on (byte << 8) yields byte*mod >> 8
+ slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
+ for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope)) // two pixels per iteration; data is stepped once in the body and once here
+ {
+ __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4])); // each input byte lands in the high byte of a 16-bit lane (byte << 8)
+ __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2; // modulation for pixel x (float -> int32 conversion, not the truncating cast of the removed code)
+ data = _mm_add_ps(data, slope);
+ mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1]))); // modulation for pixel x+1
+ mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2)); // signed-saturate to 16 bit; low half = pixel x, high half = pixel x+1
+ pix = _mm_mulhi_epu16(pix, mod); // ((byte << 8) * mod) >> 16
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix)); // saturate to 0..255 and store 8 bytes
+ }
+ for (;x < endx;x++, data = _mm_add_ps(data, slope)) // remaining odd pixel, same math on a single 32-bit pixel
+ {
+ __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
+ __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
+ mod = _mm_packs_epi32(mod, mod);
+ pix = _mm_mulhi_epu16(pix, mod);
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
}
+#endif
}
// For each pixel x in [span->startx, span->endx): out4ub[x] = (data + slope*x) * 255 * zf[x], 4 bytes per pixel,
// where data/slope come from span->data[0][arrayindex] / span->data[1][arrayindex] with components 0 and 2 swapped.
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
{
+#ifdef SSE2_PRESENT
int x;
int startx = span->startx;
int endx = span->endx;
- float data[4];
- float slope[4];
- float z;
- data[2] = span->data[0][arrayindex][0]*255.0f;
- data[1] = span->data[0][arrayindex][1]*255.0f;
- data[0] = span->data[0][arrayindex][2]*255.0f;
- data[3] = span->data[0][arrayindex][3]*255.0f;
- slope[2] = span->data[1][arrayindex][0]*255.0f;
- slope[1] = span->data[1][arrayindex][1]*255.0f;
- slope[0] = span->data[1][arrayindex][2]*255.0f;
- slope[3] = span->data[1][arrayindex][3]*255.0f;
- for (x = startx;x < endx;x++)
- {
- z = zf[x];
- out4ub[x*4+0] = (int)((data[0] + slope[0]*x) * z);
- out4ub[x*4+1] = (int)((data[1] + slope[1]*x) * z);
- out4ub[x*4+2] = (int)((data[2] + slope[2]*x) * z);
- out4ub[x*4+3] = (int)((data[3] + slope[3]*x) * z);
+ __m128 data = _mm_load_ps(span->data[0][arrayindex]), slope = _mm_load_ps(span->data[1][arrayindex]); // _mm_load_ps needs 16-byte aligned addresses
+ data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2)); // swap components 0 and 2, keep 1 and 3
+ slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
+ data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))); // value at the first pixel; afterwards advanced incrementally by slope
+ data = _mm_mul_ps(data, _mm_set1_ps(255.0f)); // scale to the 0..255 byte range
+ slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
+ for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope)) // two pixels per iteration; data is stepped once in the body and once here
+ {
+ __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2; // pixel x (float -> int32 conversion, not the truncating cast of the removed code)
+ data = _mm_add_ps(data, slope);
+ pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1]))); // pixel x+1
+ pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2)); // signed-saturate to 16 bit; low half = pixel x, high half = pixel x+1
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix)); // saturate to 0..255 and store 8 bytes
+ }
+ for (;x < endx;x++, data = _mm_add_ps(data, slope)) // remaining odd pixel
+ {
+ __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
+ pix = _mm_packs_epi32(pix, pix);
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
}
+#endif
}
// For each pixel x in [span->startx, span->endx): out4ub[x] = min(255, ina4ub[x] + max(0, inb4ub[x] - subcolor*255)),
// 4 bytes per pixel, with subcolor components 0 and 2 swapped to match the byte order of the buffers.
// The difference is clamped at zero BEFORE the add, exactly like the removed scalar code; clamping only the final sum
// would let pixels below the threshold darken ina4ub instead of contributing nothing.
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
{
+#ifdef SSE2_PRESENT
int x, startx = span->startx, endx = span->endx;
- int c[4], localcolor[4];
- localcolor[2] = (int)(subcolor[0] * 255.0f);
- localcolor[1] = (int)(subcolor[1] * 255.0f);
- localcolor[0] = (int)(subcolor[2] * 255.0f);
- localcolor[3] = (int)(subcolor[3] * 255.0f);
- for (x = startx;x < endx;x++)
+ __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // unaligned load, scale to bytes, swap components 0 and 2
+ localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0)); // four 16-bit channels replicated into both 64-bit halves (one per pixel)
+ for (x = startx;x+2 <= endx;x+=2) // two pixels per iteration
+ {
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128()); // zero-extend bytes to 16-bit lanes
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
+ pix1 = _mm_adds_epi16(pix1, _mm_max_epi16(_mm_subs_epi16(pix2, localcolor), _mm_setzero_si128())); // ina + max(0, inb - localcolor), saturating so extreme subcolor values cannot wrap
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1)); // saturate to 0..255 and store 8 bytes
+ }
+ if(x < endx) // remaining odd pixel
{
- c[0] = inb4ub[x*4+0] - localcolor[0];if (c[0] < 0) c[0] = 0;
- c[1] = inb4ub[x*4+1] - localcolor[1];if (c[1] < 0) c[1] = 0;
- c[2] = inb4ub[x*4+2] - localcolor[2];if (c[2] < 0) c[2] = 0;
- c[3] = inb4ub[x*4+3] - localcolor[3];if (c[3] < 0) c[3] = 0;
- c[0] += ina4ub[x*4+0];if (c[0] > 255) c[0] = 255;
- c[1] += ina4ub[x*4+1];if (c[1] > 255) c[1] = 255;
- c[2] += ina4ub[x*4+2];if (c[2] > 255) c[2] = 255;
- c[3] += ina4ub[x*4+3];if (c[3] > 255) c[3] = 255;
- out4ub[x*4+0] = c[0];
- out4ub[x*4+1] = c[1];
- out4ub[x*4+2] = c[2];
- out4ub[x*4+3] = c[3];
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
+ pix1 = _mm_adds_epi16(pix1, _mm_max_epi16(_mm_subs_epi16(pix2, localcolor), _mm_setzero_si128())); // same clamp-then-add as the paired loop
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
}
+#endif
}
// For each pixel x in [span->startx, span->endx): out4ub[x] = (ina4ub[x] * inb4ub[x]) >> 8 per byte, 4 bytes per pixel.
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
{
+#ifdef SSE2_PRESENT
int x, startx = span->startx, endx = span->endx;
- for (x = startx;x < endx;x++)
+ for (x = startx;x+2 <= endx;x+=2) // two pixels per iteration
+ {
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128()); // a: bytes zero-extended to 16-bit lanes
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4])); // b: bytes placed in the high byte of each lane (b << 8)
+ pix1 = _mm_mulhi_epu16(pix1, pix2); // (a * (b << 8)) >> 16 == (a * b) >> 8
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1)); // narrow to bytes and store 8 bytes
+ }
+ if(x < endx) // remaining odd pixel
{
- out4ub[x*4+0] = (ina4ub[x*4+0] * inb4ub[x*4+0])>>8;
- out4ub[x*4+1] = (ina4ub[x*4+1] * inb4ub[x*4+1])>>8;
- out4ub[x*4+2] = (ina4ub[x*4+2] * inb4ub[x*4+2])>>8;
- out4ub[x*4+3] = (ina4ub[x*4+3] * inb4ub[x*4+3])>>8;
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
+ pix1 = _mm_mulhi_epu16(pix1, pix2);
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
}
+#endif
}
// For each pixel x in [span->startx, span->endx): out4ub[x] = min(255, ina4ub[x] + inb4ub[x]) per byte, 4 bytes per pixel.
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
{
+#ifdef SSE2_PRESENT
int x, startx = span->startx, endx = span->endx;
- int d[4];
- for (x = startx;x < endx;x++)
+ for (x = startx;x+2 <= endx;x+=2) // two pixels per iteration
{
- d[0] = ina4ub[x*4+0] + inb4ub[x*4+0];if (d[0] > 255) d[0] = 255;
- d[1] = ina4ub[x*4+1] + inb4ub[x*4+1];if (d[1] > 255) d[1] = 255;
- d[2] = ina4ub[x*4+2] + inb4ub[x*4+2];if (d[2] > 255) d[2] = 255;
- d[3] = ina4ub[x*4+3] + inb4ub[x*4+3];if (d[3] > 255) d[3] = 255;
- out4ub[x*4+0] = d[0];
- out4ub[x*4+1] = d[1];
- out4ub[x*4+2] = d[2];
- out4ub[x*4+3] = d[3];
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128()); // zero-extend bytes to 16-bit lanes
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
+ pix1 = _mm_add_epi16(pix1, pix2); // at most 255 + 255, cannot overflow a 16-bit lane
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1)); // packus saturates to 0..255, giving the clamp of the removed code
}
+ if(x < endx) // remaining odd pixel
+ {
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
+ pix1 = _mm_add_epi16(pix1, pix2);
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ }
+#endif
}
// For each pixel x in [span->startx, span->endx): out4ub[x] = min(255, ina4ub[x] + ((inb4ub[x] * tint) >> 8)) per byte,
// where tint is inbtintbgra * 256 converted to integer (no component reordering is applied to inbtintbgra).
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
{
+#ifdef SSE2_PRESENT
int x, startx = span->startx, endx = span->endx;
- int d[4];
- int b[4];
- b[0] = (int)(inbtintbgra[0] * 256.0f);
- b[1] = (int)(inbtintbgra[1] * 256.0f);
- b[2] = (int)(inbtintbgra[2] * 256.0f);
- b[3] = (int)(inbtintbgra[3] * 256.0f);
- for (x = startx;x < endx;x++)
+ __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f))); // unaligned load; 8.8 fixed-point tint per channel
+ tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0)); // four 16-bit channels replicated into both 64-bit halves (one per pixel)
+ for (x = startx;x+2 <= endx;x+=2) // two pixels per iteration
+ {
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128()); // a: bytes zero-extended to 16-bit lanes
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4])); // b: bytes placed in the high byte of each lane (b << 8)
+ pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2)); // a + ((tint * (b << 8)) >> 16) == a + ((b * tint) >> 8)
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1)); // saturate to 0..255 and store 8 bytes
+ }
+ if(x < endx) // remaining odd pixel
{
- d[0] = ina4ub[x*4+0] + ((inb4ub[x*4+0]*b[0])>>8);if (d[0] > 255) d[0] = 255;
- d[1] = ina4ub[x*4+1] + ((inb4ub[x*4+1]*b[1])>>8);if (d[1] > 255) d[1] = 255;
- d[2] = ina4ub[x*4+2] + ((inb4ub[x*4+2]*b[2])>>8);if (d[2] > 255) d[2] = 255;
- d[3] = ina4ub[x*4+3] + ((inb4ub[x*4+3]*b[3])>>8);if (d[3] > 255) d[3] = 255;
- out4ub[x*4+0] = d[0];
- out4ub[x*4+1] = d[1];
- out4ub[x*4+2] = d[2];
- out4ub[x*4+3] = d[3];
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
+ pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
}
+#endif
}
// For each pixel x in [span->startx, span->endx): blends ina4ub[x] toward inb4ub[x] using byte 3 of the inb4ub pixel
// as the blend weight: out = ina + (((inb - ina) * weight) >> 8) per byte, 4 bytes per pixel.
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
{
+#ifdef SSE2_PRESENT
int x, startx = span->startx, endx = span->endx;
- int a, b;
- for (x = startx;x < endx;x++)
+ for (x = startx;x+2 <= endx;x+=2) // two pixels per iteration
+ {
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128()); // zero-extend bytes to 16-bit lanes
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
+ __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); // broadcast lane 3 (byte 3 of each inb pixel) across that pixel's four lanes
+ pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4))); // both factors << 4 gives << 8 total, so the signed mulhi (>> 16) yields ((inb - ina) * blend) >> 8
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1)); // saturate to 0..255 and store 8 bytes
+ }
+ if(x < endx) // remaining odd pixel; only the low four lanes are used
{
- a = 256 - inb4ub[x*4+3];
- b = inb4ub[x*4+3];
- out4ub[x*4+0] = (ina4ub[x*4+0] * a + inb4ub[x*4+0] * b)>>8;
- out4ub[x*4+1] = (ina4ub[x*4+1] * a + inb4ub[x*4+1] * b)>>8;
- out4ub[x*4+2] = (ina4ub[x*4+2] * a + inb4ub[x*4+2] * b)>>8;
- out4ub[x*4+3] = (ina4ub[x*4+3] * a + inb4ub[x*4+3] * b)>>8;
+ __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
+ __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
+ __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
+ pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
}
+#endif
}
// For each pixel x in [span->startx, span->endx): blends in4ub[x] toward a constant color (color * 255, components 0 and 2
// swapped) using that color's component 3 as the blend weight: out = in + (((localcolor - in) * weight) >> 8) per byte.
// The entire body is inside #ifdef SSE2_PRESENT, so when that macro is undefined this function does nothing.
void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Draw_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
{
+#ifdef SSE2_PRESENT
int x, startx = span->startx, endx = span->endx;
- int localcolor[4], ilerp, lerp;
- localcolor[2] = (int)(color[0]*255.0f);
- localcolor[1] = (int)(color[1]*255.0f);
- localcolor[0] = (int)(color[2]*255.0f);
- localcolor[3] = (int)(color[3]*255.0f);
- ilerp = 256 - localcolor[3];
- lerp = localcolor[3];
- for (x = startx;x < endx;x++)
+ __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend; // unaligned load, scale to bytes, swap components 0 and 2
+ localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0)); // four 16-bit channels replicated into both 64-bit halves (one per pixel)
+ blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4); // component 3 broadcast to every lane, pre-shifted << 4 (loop-invariant)
+ for (x = startx;x+2 <= endx;x+=2) // two pixels per iteration
{
- out4ub[x*4+0] = (in4ub[x*4+0] * ilerp + localcolor[0] * lerp)>>8;
- out4ub[x*4+1] = (in4ub[x*4+1] * ilerp + localcolor[1] * lerp)>>8;
- out4ub[x*4+2] = (in4ub[x*4+2] * ilerp + localcolor[2] * lerp)>>8;
- out4ub[x*4+3] = (in4ub[x*4+3] * ilerp + localcolor[3] * lerp)>>8;
+ __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128()); // zero-extend bytes to 16-bit lanes
+ pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend)); // both factors << 4 gives << 8 total, so the signed mulhi (>> 16) yields ((localcolor - in) * weight) >> 8
+ _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix)); // saturate to 0..255 and store 8 bytes
}
+ if(x < endx) // remaining odd pixel; only the low four lanes are used
+ {
+ __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
+ pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
+ *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+ }
+#endif
}
// Span shader for the vertex-color mode; this hunk replaces the scalar float loop with an SSE2 version.
// For every pixel x in [startx, endx) whose pixelmask byte is set it computes, per BGRA channel in fixed point,
//   texel(GL20TU_COLOR) * (Color_Ambient + (interpolated vertex color * z) * Color_Diffuse)
// and stores the result as 8-bit BGRA. The whole body sits inside #ifdef SSE2_PRESENT, so without that
// define the function does nothing.
// Output goes straight to the framebuffer row unless alpha test is on or the blend mode is not opaque, in which
// case it goes to a scratch buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
void DPSOFTRAST_PixelShader_VertexColor(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
{
+#ifdef SSE2_PRESENT
+ unsigned char * RESTRICT pixelmask = span->pixelmask; // one byte per pixel; zero means "skip this pixel"
+ unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + span->start * 4; // default target: framebuffer, 4 bytes per pixel
+ int x, startx = span->startx, endx = span->endx;
+ __m128i Color_Ambientm, Color_Diffusem; // uniforms as 16-bit fixed-point lanes, B,G,R,A order
+ __m128 data, slope; // interpolated vertex color and its per-pixel step, B,G,R,A order
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
- int x, startx = span->startx, endx = span->endx;
- float Color_Ambient[4], Color_Diffuse[4];
- float data[4];
- float slope[4];
- float z;
int arrayindex = DPSOFTRAST_ARRAY_COLOR;
- data[2] = span->data[0][arrayindex][0];
- data[1] = span->data[0][arrayindex][1];
- data[0] = span->data[0][arrayindex][2];
- data[3] = span->data[0][arrayindex][3];
- slope[2] = span->data[1][arrayindex][0];
- slope[1] = span->data[1][arrayindex][1];
- slope[0] = span->data[1][arrayindex][2];
- slope[3] = span->data[1][arrayindex][3];
- Color_Ambient[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
- Color_Ambient[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
- Color_Ambient[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
- Color_Ambient[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
- Color_Diffuse[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
- Color_Diffuse[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
- Color_Diffuse[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
- Color_Diffuse[3] = 0.0f;
DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
- for (x = startx;x < endx;x++)
- {
- z = buffer_z[x];
- buffer_FragColorbgra8[x*4+0] = (int)(buffer_texture_colorbgra8[x*4+0] * (Color_Ambient[0] + ((data[0] + slope[0]*x) * z) * Color_Diffuse[0]));
- buffer_FragColorbgra8[x*4+1] = (int)(buffer_texture_colorbgra8[x*4+1] * (Color_Ambient[1] + ((data[1] + slope[1]*x) * z) * Color_Diffuse[1]));
- buffer_FragColorbgra8[x*4+2] = (int)(buffer_texture_colorbgra8[x*4+2] * (Color_Ambient[2] + ((data[2] + slope[2]*x) * z) * Color_Diffuse[2]));
- buffer_FragColorbgra8[x*4+3] = (int)(buffer_texture_colorbgra8[x*4+3] * (Color_Ambient[3] + ((data[3] + slope[3]*x) * z) * Color_Diffuse[3]));
- }
- DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
+ if (dpsoftrast.user.alphatest || dpsoftrast.fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+ pixel = buffer_FragColorbgra8; // write to the scratch buffer instead; it is passed to FinishBGRA8 at the end
+ Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // ambient * 256 as ints; the shuffle swaps lanes 0 and 2 (RGBA -> BGRA). NOTE(review): _mm_load_ps needs a 16-byte aligned uniform4f entry - confirm
+ Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); // clear lane 3 (alpha)
+ Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f))); // lane 3 comes from the Alpha uniform, scaled by 255 rather than 256
+ Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm); // narrow to 16-bit lanes (signed saturation); same BGRA in both 64-bit halves
+ Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // diffuse * 4096, same lane swap
+ Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); // diffuse alpha lane forced to 0 (matches the removed Color_Diffuse[3] = 0.0f)
+ Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
+ data = _mm_load_ps(span->data[0][arrayindex]); // base value of the COLOR interpolant for this span
+ slope = _mm_load_ps(span->data[1][arrayindex]); // per-pixel step of the COLOR interpolant
+ data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2)); // RGBA -> BGRA
+ slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2)); // RGBA -> BGRA
+ data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))); // move to the first pixel of the span
+ data = _mm_mul_ps(data, _mm_set1_ps(4096.0f)); // pre-scale so that (data*z) converts to 4096-based fixed point
+ slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f)); // step gets the same scale
+ for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope)) // data is stepped once per pixel, including masked-out ones
+ {
+ __m128i color, mod, pix;
+ if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101) // fast path: 4 pixels left and all four mask bytes equal 1
+ {
+ __m128i pix2, mod2;
+ __m128 z = _mm_loadu_ps(&buffer_z[x]); // z for pixels x..x+3
+ color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]); // four BGRA texels
+ mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0)))); // pixel x: color * z[0]
+ data = _mm_add_ps(data, slope);
+ mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1))))); // low half = pixel x, high half = pixel x+1
+ data = _mm_add_ps(data, slope);
+ mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2)))); // pixel x+2
+ data = _mm_add_ps(data, slope);
+ mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3))))); // low half = pixel x+2, high half = pixel x+3
+ pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm), // (diffuse*mod >> 16) + ambient, then times texel
+ _mm_unpacklo_epi8(_mm_setzero_si128(), color)); // texel byte in the high byte of each 16-bit lane (texel << 8), pixels x and x+1
+ pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
+ _mm_unpackhi_epi8(_mm_setzero_si128(), color)); // same for pixels x+2 and x+3
+ _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2)); // saturate to bytes and store four pixels
+ x += 3; // plus the loop's own x++ (and its data step) gives an advance of four pixels
+ continue;
+ }
+ if(!pixelmask[x]) // single-pixel path: skip masked-out pixels
+ continue;
+ color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4])); // one texel, each byte moved to the high byte of a 16-bit lane
+ mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); // interpolated color * z for this pixel
+ mod = _mm_packs_epi32(mod, mod);
+ pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
+ *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix)); // store one BGRA pixel
+ }
+ if(pixel == buffer_FragColorbgra8) // only the scratch-buffer case needs the finish pass
+ DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
+#endif
}
// Span shader for the lightmap mode; this hunk replaces the scalar integer loop with an SSE2 version.
// For every pixel x in [startx, endx) whose pixelmask byte is set it computes, per BGRA channel in fixed point,
//   color_texel * (Color_Ambient + lightmap_texel * Color_Diffuse)
// and, when SHADERPERMUTATION_GLOW is set in dpsoftrast.shader_permutation, adds glow_texel * Color_Glow.
// The whole body sits inside #ifdef SSE2_PRESENT, so without that define the function does nothing.
// Output goes straight to the framebuffer row unless alpha test is on or the blend mode is not opaque, in which
// case it goes to a scratch buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
void DPSOFTRAST_PixelShader_Lightmap(const DPSOFTRAST_State_Draw_Span * RESTRICT span)
{
+#ifdef SSE2_PRESENT
+ unsigned char * RESTRICT pixelmask = span->pixelmask; // one byte per pixel; zero means "skip this pixel"
+ unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + span->start * 4; // default target: framebuffer, 4 bytes per pixel
int x, startx = span->startx, endx = span->endx;
- int Color_Ambienti[4], Color_Diffusei[4], Color_Glowi[4];
+ __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm; // uniforms as 16-bit fixed-point lanes; the Glow ones are only set in the glow branch
float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
- unsigned int d[4];
- //unsigned char * RESTRICT pixelmask = span->pixelmask;
- //unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + span->start * 4;
DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
- Color_Ambienti[2] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0] * 65536.0f);
- Color_Ambienti[1] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1] * 65536.0f);
- Color_Ambienti[0] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2] * 65536.0f);
- Color_Ambienti[3] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] * 65536.0f);
- Color_Diffusei[2] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0] * 256.0f);
- Color_Diffusei[1] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1] * 256.0f);
- Color_Diffusei[0] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2] * 256.0f);
- Color_Diffusei[3] = 0;
- Color_Glowi[2] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0] * 65536.0f);
- Color_Glowi[1] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1] * 65536.0f);
- Color_Glowi[0] = (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2] * 65536.0f);
- Color_Glowi[3] = 0;
DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
+ if (dpsoftrast.user.alphatest || dpsoftrast.fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+ pixel = buffer_FragColorbgra8; // write to the scratch buffer instead; it is passed to FinishBGRA8 at the end
+ Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // ambient * 256 as ints; the shuffle swaps lanes 0 and 2 (RGBA -> BGRA). NOTE(review): _mm_load_ps needs a 16-byte aligned uniform4f entry - confirm
+ Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); // clear lane 3 (alpha)
+ Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f))); // lane 3 comes from the Alpha uniform, scaled by 255 rather than 256
+ Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm); // narrow to 16-bit lanes (signed saturation); same BGRA in both 64-bit halves
+ Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // diffuse * 256, same lane swap
+ Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); // diffuse alpha lane forced to 0 (matches the removed Color_Diffusei[3] = 0)
+ Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
{
DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
+ Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // glow * 256, RGBA -> BGRA
+ Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); // glow alpha lane forced to 0
+ Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
+ Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm); // low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path
for (x = startx;x < endx;x++)
{
- d[0] = (buffer_texture_glowbgra8[x*4+0] * Color_Glowi[0] + buffer_texture_colorbgra8[x*4+0] * (Color_Ambienti[0] + buffer_texture_lightmapbgra8[x*4+0] * Color_Diffusei[0])) >> 16;if (d[0] > 255) d[0] = 255;
- d[1] = (buffer_texture_glowbgra8[x*4+1] * Color_Glowi[1] + buffer_texture_colorbgra8[x*4+1] * (Color_Ambienti[1] + buffer_texture_lightmapbgra8[x*4+1] * Color_Diffusei[1])) >> 16;if (d[1] > 255) d[1] = 255;
- d[2] = (buffer_texture_glowbgra8[x*4+2] * Color_Glowi[2] + buffer_texture_colorbgra8[x*4+2] * (Color_Ambienti[2] + buffer_texture_lightmapbgra8[x*4+2] * Color_Diffusei[2])) >> 16;if (d[2] > 255) d[2] = 255;
- d[3] = (buffer_texture_glowbgra8[x*4+3] * Color_Glowi[3] + buffer_texture_colorbgra8[x*4+3] * (Color_Ambienti[3] + buffer_texture_lightmapbgra8[x*4+3] * Color_Diffusei[3])) >> 16;if (d[3] > 255) d[3] = 255;
- buffer_FragColorbgra8[x*4+0] = d[0];
- buffer_FragColorbgra8[x*4+1] = d[1];
- buffer_FragColorbgra8[x*4+2] = d[2];
- buffer_FragColorbgra8[x*4+3] = d[3];
+ __m128i color, lightmap, glow, pix;
+ if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101) // fast path: 4 pixels left and all four mask bytes equal 1
+ {
+ __m128i pix2;
+ color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]); // four BGRA texels from each buffer
+ lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
+ glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
+ pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), // pixels x, x+1: (diffuse*lightmap + ambient) ...
+ _mm_unpacklo_epi8(_mm_setzero_si128(), color)), // ... times color texel (bytes sit in the high byte of each 16-bit lane)
+ _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow))); // ... plus glow texel * glow color
+ pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), // same for pixels x+2, x+3
+ _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
+ _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
+ _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2)); // saturate to bytes and store four pixels
+ x += 3; // plus the loop's own x++ gives an advance of four pixels
+ continue;
+ }
+ if(!pixelmask[x]) // single-pixel path: skip masked-out pixels
+ continue;
+ color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4])); // one texel per buffer in the low 64 bits, upper 64 bits zero
+ lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
+ glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
+ pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow)); // low half: (diffuse*lightmap + ambient)*color; high half: (0 + glowcolor)*glow, since lightmap's upper half is zero
+ pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2))); // add the glow term (high half) onto the lit color (low half)
+ *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix)); // store one BGRA pixel
}
}
else
{
for (x = startx;x < endx;x++)
{
- d[0] = (buffer_texture_colorbgra8[x*4+0] * (Color_Ambienti[0] + buffer_texture_lightmapbgra8[x*4+0] * Color_Diffusei[0])) >> 16;if (d[0] > 255) d[0] = 255;
- d[1] = (buffer_texture_colorbgra8[x*4+1] * (Color_Ambienti[1] + buffer_texture_lightmapbgra8[x*4+1] * Color_Diffusei[1])) >> 16;if (d[1] > 255) d[1] = 255;
- d[2] = (buffer_texture_colorbgra8[x*4+2] * (Color_Ambienti[2] + buffer_texture_lightmapbgra8[x*4+2] * Color_Diffusei[2])) >> 16;if (d[2] > 255) d[2] = 255;
- d[3] = (buffer_texture_colorbgra8[x*4+3] * (Color_Ambienti[3] + buffer_texture_lightmapbgra8[x*4+3] * Color_Diffusei[3])) >> 16;if (d[3] > 255) d[3] = 255;
- buffer_FragColorbgra8[x*4+0] = d[0];
- buffer_FragColorbgra8[x*4+1] = d[1];
- buffer_FragColorbgra8[x*4+2] = d[2];
- buffer_FragColorbgra8[x*4+3] = d[3];
+ __m128i color, lightmap, pix;
+ if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101) // fast path: 4 pixels left and all four mask bytes equal 1
+ {
+ __m128i pix2;
+ color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
+ lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
+ pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), // pixels x, x+1: (diffuse*lightmap + ambient) * color
+ _mm_unpacklo_epi8(_mm_setzero_si128(), color));
+ pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), // pixels x+2, x+3
+ _mm_unpackhi_epi8(_mm_setzero_si128(), color));
+ _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2)); // saturate to bytes and store four pixels
+ x += 3; // plus the loop's own x++ gives an advance of four pixels
+ continue;
+ }
+ if(!pixelmask[x]) // single-pixel path: skip masked-out pixels
+ continue;
+ color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4])); // texel bytes moved to the high byte of 16-bit lanes
+ lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
+ pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
+ *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix)); // store one BGRA pixel
}
}
- DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
+ if(pixel == buffer_FragColorbgra8) // only the scratch-buffer case needs the finish pass
+ DPSOFTRAST_Draw_Span_FinishBGRA8(span, buffer_FragColorbgra8);
+#endif
}
wslope = span->data[1][DPSOFTRAST_ARRAY_TOTAL][3];
if (dpsoftrast.user.depthtest && dpsoftrast.fb_depthpixels)
{
- depth = (int)(w*DPSOFTRAST_DEPTHSCALE);
depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
+ depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(dpsoftrast.user.polygonoffset[1] + fabs(wslope)*dpsoftrast.user.polygonoffset[0]));
depthpixel = dpsoftrast.fb_depthpixels + span->start;
switch(dpsoftrast.fb_depthfunc)
{
if (dpsoftrast.fb_colorpixels[0] && dpsoftrast.fb_colormask)
DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Span(span);
if (dpsoftrast.user.depthmask)
- for (x = 0, d = depth;x < span->length;x++, d += depthslope)
+ for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
if (pixelmask[x])
depthpixel[x] = d;
}
void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask)
{
+#ifdef SSE2_PRESENT
int cullface = dpsoftrast.user.cullface;
int width = dpsoftrast.fb_width;
int height = dpsoftrast.fb_height;
+ __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
int i;
int j;
int k;
int y;
int e[3];
- int screenx[4];
- int screeny[4];
- int screenyless[4];
+ __m128i screeny;
+ int starty, endy;
int numpoints;
- int clipflags;
int edge0p;
int edge0n;
int edge1p;
int edge1n;
- int extent[6];
int startx;
int endx;
- float mip_edge0tc[2];
- float mip_edge1tc[2];
- float mip_edge0xy[2];
- float mip_edge1xy[2];
- float mip_edge0xymul;
- float mip_edge1xymul;
- float mip_edge0mip;
- float mip_edge1mip;
- float mipdensity;
unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS];
- float startxf;
- float endxf;
- float edge0ylerp;
- float edge0yilerp;
- float edge1ylerp;
- float edge1yilerp;
- float edge0xf;
- float edge1xf;
- float spanilength;
- float startxlerp;
- float yc;
- float w;
- float frac;
- float ifrac;
- float trianglearea2;
- float triangleedge[2][4];
- float trianglenormal[4];
+ __m128 mipedgescale;
float clipdist[4];
- float clipped[DPSOFTRAST_ARRAY_TOTAL][4][4];
- float screen[4][4];
- float proj[DPSOFTRAST_ARRAY_TOTAL][4][4];
+ __m128 clipfrac[4];
+ __m128 clipped[DPSOFTRAST_ARRAY_TOTAL][4];
+ __m128 screen[4];
+ __m128 proj[DPSOFTRAST_ARRAY_TOTAL][4];
DPSOFTRAST_Texture *texture;
DPSOFTRAST_State_Draw_Span *span;
DPSOFTRAST_State_Draw_Span *oldspan;
e[1] = i*3+1;
e[2] = i*3+2;
}
- triangleedge[0][0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+0] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+0];
- triangleedge[0][1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+1] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+1];
- triangleedge[0][2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2];
- triangleedge[1][0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+0] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+0];
- triangleedge[1][1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+1] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+1];
- triangleedge[1][2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] - dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2];
- trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
- trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
- trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
- trianglearea2 = trianglenormal[0] * trianglenormal[0] + trianglenormal[1] * trianglenormal[1] + trianglenormal[2] * trianglenormal[2];
- // skip degenerate triangles, nothing good can come from them...
- if (trianglearea2 == 0.0f)
- continue;
- // apply current cullface mode (this culls many triangles)
- switch(cullface)
- {
- case GL_BACK:
- if (trianglenormal[2] < 0)
- continue;
- break;
- case GL_FRONT:
- if (trianglenormal[2] > 0)
- continue;
- break;
- }
- // calculate distance from nearplane
- clipdist[0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + 1.0f;
- clipdist[1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + 1.0f;
- clipdist[2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + 1.0f;
- clipflags = 0;
- if (clipdist[0] < 0.0f)
- clipflags |= 1;
- if (clipdist[1] < 0.0f)
- clipflags |= 2;
- if (clipdist[2] < 0.0f)
- clipflags |= 4;
- // clip triangle if necessary
- switch(clipflags)
- {
- case 0: /*000*/
- // triangle is entirely in front of nearplane
+
+#define SKIPBACKFACE \
+ if(cullface != GL_NONE) \
+ { \
+ __m128 triangleedge[2] = { _mm_sub_ps(screen[0], screen[1]), _mm_sub_ps(screen[2], screen[1]) }; \
+ /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
+ __m128 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))), \
+ _mm_mul_ss(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1])); \
+ /* apply current cullface mode (this culls many triangles) */ \
+ switch(cullface) \
+ { \
+ case GL_BACK: \
+ if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
+ continue; \
+ break; \
+ case GL_FRONT: \
+ if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
+ continue; \
+ break; \
+ } \
+ }
+ //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
+ // _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
+ //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
+ //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
+ //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
// macros for clipping vertices
-#define CLIPPEDVERTEXLERP(k,p1,p2) \
- frac = clipdist[p1] / (clipdist[p1] - clipdist[p2]);\
- ifrac = 1.0f - frac;\
- for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
+#define CLIPPEDVERTEXLERP(k,p1, p2) \
+ clipfrac[k] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
+ { \
+ __m128 v1 = _mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
+ clipped[DPSOFTRAST_ARRAY_POSITION][k] = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[k])); \
+ } \
+ screen[k] = DPSOFTRAST_Draw_ProjectVertex(clipped[DPSOFTRAST_ARRAY_POSITION][k]);
+#define CLIPPEDATTRIBSLERP(k,p1,p2) \
+ for (j = DPSOFTRAST_ARRAY_POSITION+1;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
{\
/*if (arraymask[j])*/\
{\
- clipped[j][k][0] = dpsoftrast.draw.post_array4f[j][e[p1]*4+0]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+0]*frac;\
- clipped[j][k][1] = dpsoftrast.draw.post_array4f[j][e[p1]*4+1]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+1]*frac;\
- clipped[j][k][2] = dpsoftrast.draw.post_array4f[j][e[p1]*4+2]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+2]*frac;\
- clipped[j][k][3] = dpsoftrast.draw.post_array4f[j][e[p1]*4+3]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+3]*frac;\
+ __m128 v1 = _mm_load_ps(&dpsoftrast.draw.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.draw.post_array4f[j][e[p2]*4]); \
+ clipped[j][k] = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[k])); \
}\
- }\
- DPSOFTRAST_Draw_ProjectVertices(screen[k], clipped[DPSOFTRAST_ARRAY_POSITION][k], 1)
+ }
#define CLIPPEDVERTEXCOPY(k,p1) \
+ screen[k] = _mm_load_ps(&dpsoftrast.draw.screencoord4f[e[p1]*4]);
+#define CLIPPEDATTRIBSCOPY(k,p1) \
for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
{\
/*if (arraymask[j])*/\
{\
- clipped[j][k][0] = dpsoftrast.draw.post_array4f[j][e[p1]*4+0];\
- clipped[j][k][1] = dpsoftrast.draw.post_array4f[j][e[p1]*4+1];\
- clipped[j][k][2] = dpsoftrast.draw.post_array4f[j][e[p1]*4+2];\
- clipped[j][k][3] = dpsoftrast.draw.post_array4f[j][e[p1]*4+3];\
+ clipped[j][k] = _mm_load_ps(&dpsoftrast.draw.post_array4f[j][e[p1]*4]); \
}\
- }\
- screen[k][0] = dpsoftrast.draw.screencoord4f[e[p1]*4+0];\
- screen[k][1] = dpsoftrast.draw.screencoord4f[e[p1]*4+1];\
- screen[k][2] = dpsoftrast.draw.screencoord4f[e[p1]*4+2];\
- screen[k][3] = dpsoftrast.draw.screencoord4f[e[p1]*4+3];
-
- CLIPPEDVERTEXCOPY(0,0);
- CLIPPEDVERTEXCOPY(1,1);
- CLIPPEDVERTEXCOPY(2,2);
- numpoints = 3;
- break;
- case 1: /*100*/
- CLIPPEDVERTEXLERP(0,0,1);
- CLIPPEDVERTEXCOPY(1,1);
- CLIPPEDVERTEXCOPY(2,2);
- CLIPPEDVERTEXLERP(3,2,0);
- numpoints = 4;
- break;
- case 2: /*010*/
- CLIPPEDVERTEXCOPY(0,0);
- CLIPPEDVERTEXLERP(1,0,1);
- CLIPPEDVERTEXLERP(2,1,2);
- CLIPPEDVERTEXCOPY(3,2);
- numpoints = 4;
- break;
- case 3: /*110*/
- CLIPPEDVERTEXLERP(0,1,2);
- CLIPPEDVERTEXCOPY(1,2);
- CLIPPEDVERTEXLERP(2,2,0);
- numpoints = 3;
- break;
- case 4: /*001*/
- CLIPPEDVERTEXCOPY(0,0);
- CLIPPEDVERTEXCOPY(1,1);
- CLIPPEDVERTEXLERP(2,1,2);
- CLIPPEDVERTEXLERP(3,2,0);
- numpoints = 4;
- break;
- case 5: /*101*/
- CLIPPEDVERTEXLERP(0,0,1);
- CLIPPEDVERTEXCOPY(1,1);
- CLIPPEDVERTEXLERP(2,1,2);
- numpoints = 3;
- break;
- case 6: /*011*/
- CLIPPEDVERTEXCOPY(0,0);
- CLIPPEDVERTEXLERP(1,0,1);
- CLIPPEDVERTEXLERP(2,2,0);
- numpoints = 3;
- break;
- case 7: /*111*/
- // triangle is entirely behind nearplane
- continue;
+ }
+
+ // calculate distance from nearplane
+ clipdist[0] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
+ clipdist[1] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
+ clipdist[2] = dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
+ if (clipdist[0] >= 0.0f)
+ {
+ if (clipdist[1] >= 0.0f)
+ {
+ if (clipdist[2] >= 0.0f)
+ {
+ // triangle is entirely in front of nearplane
+ CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
+ numpoints = 3;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSCOPY(2,2);
+ }
+ else
+ {
+ CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
+ numpoints = 4;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSLERP(2,1,2); CLIPPEDATTRIBSLERP(3,2,0);
+ }
+ }
+ else
+ {
+ if (clipdist[2] >= 0.0f)
+ {
+ CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
+ numpoints = 4;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSLERP(1,0,1); CLIPPEDATTRIBSLERP(2,1,2); CLIPPEDATTRIBSCOPY(3,2);
+ }
+ else
+ {
+ CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
+ numpoints = 3;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSCOPY(0,0); CLIPPEDATTRIBSLERP(1,0,1); CLIPPEDATTRIBSLERP(2,2,0);
+ }
+ }
+ }
+ else if (clipdist[1] >= 0.0f)
+ {
+ if (clipdist[2] >= 0.0f)
+ {
+ CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
+ numpoints = 4;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSLERP(0,0,1); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSCOPY(2,2); CLIPPEDATTRIBSLERP(3,2,0);
+ }
+ else
+ {
+ CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
+ numpoints = 3;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSLERP(0,0,1); CLIPPEDATTRIBSCOPY(1,1); CLIPPEDATTRIBSLERP(2,1,2);
+ }
}
- // calculate integer y coords for triangle points
- screenx[0] = (int)(screen[0][0]);
- screeny[0] = (int)(screen[0][1]);
- screenx[1] = (int)(screen[1][0]);
- screeny[1] = (int)(screen[1][1]);
- screenx[2] = (int)(screen[2][0]);
- screeny[2] = (int)(screen[2][1]);
- screenx[3] = (int)(screen[3][0]);
- screeny[3] = (int)(screen[3][1]);
- // figure out the extents (bounding box) of the triangle
- extent[0] = screenx[0];
- extent[1] = screeny[0];
- extent[2] = screenx[0];
- extent[3] = screeny[0];
- for (j = 1;j < numpoints;j++)
+ else if (clipdist[2] >= 0.0f)
{
- if (extent[0] > screenx[j]) extent[0] = screenx[j];
- if (extent[1] > screeny[j]) extent[1] = screeny[j];
- if (extent[2] < screenx[j]) extent[2] = screenx[j];
- if (extent[3] < screeny[j]) extent[3] = screeny[j];
+ CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
+ numpoints = 3;
+ SKIPBACKFACE;
+ CLIPPEDATTRIBSLERP(0,1,2); CLIPPEDATTRIBSCOPY(1,2); CLIPPEDATTRIBSLERP(2,2,0);
+ }
+ else continue; // triangle is entirely behind nearplane
+
+ {
+ // calculate integer y coords for triangle points
+ __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_shuffle_ps(screen[0], screen[1], _MM_SHUFFLE(1, 0, 1, 0))),
+ _mm_cvttps_epi32(_mm_shuffle_ps(screen[2], numpoints <= 3 ? screen[2] : screen[3], _MM_SHUFFLE(1, 0, 1, 0)))),
+ screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
+ screenmin = _mm_min_epi16(screeni, screenir),
+ screenmax = _mm_max_epi16(screeni, screenir);
+ screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
+ screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
+ screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
+ screenmax = _mm_min_epi16(screenmax, fbmax);
+ // skip offscreen triangles
+ if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
+ continue;
+ starty = _mm_extract_epi16(screenmin, 1);
+ endy = _mm_extract_epi16(screenmax, 1)+1;
+ screeny = _mm_srai_epi32(screeni, 16);
}
- //extent[0]--;
- //extent[1]--;
- extent[2]++;
- extent[3]++;
- if (extent[0] < 0)
- extent[0] = 0;
- if (extent[1] < 0)
- extent[1] = 0;
- if (extent[2] > width)
- extent[2] = width;
- if (extent[3] > height)
- extent[3] = height;
- // skip offscreen triangles
- if (extent[2] <= extent[0] || extent[3] <= extent[1])
- continue;
+
// okay, this triangle is going to produce spans, we'd better project
// the interpolants now (this is what gives perspective texturing),
// this consists of simply multiplying all arrays by the W coord
{
for (k = 0;k < numpoints;k++)
{
- w = screen[k][3];
- proj[j][k][0] = clipped[j][k][0] * w;
- proj[j][k][1] = clipped[j][k][1] * w;
- proj[j][k][2] = clipped[j][k][2] * w;
- proj[j][k][3] = clipped[j][k][3] * w;
+ proj[j][k] = _mm_mul_ps(clipped[j][k], _mm_shuffle_ps(screen[k], screen[k], _MM_SHUFFLE(3, 3, 3, 3)));
}
}
}
// adjust texture LOD by texture density, in the simplest way possible...
- mip_edge0xy[0] = screen[0][0] - screen[1][0];
- mip_edge0xy[1] = screen[0][1] - screen[1][1];
- mip_edge1xy[0] = screen[2][0] - screen[1][0];
- mip_edge1xy[1] = screen[2][1] - screen[1][1];
- mip_edge0xymul = 1.0f / (mip_edge0xy[0]*mip_edge0xy[0]+mip_edge0xy[1]*mip_edge0xy[1]);
- mip_edge1xymul = 1.0f / (mip_edge1xy[0]*mip_edge1xy[0]+mip_edge1xy[1]*mip_edge1xy[1]);
+ mipedgescale = _mm_sub_ps(_mm_shuffle_ps(screen[0], screen[2], _MM_SHUFFLE(1, 0, 1, 0)), _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 0, 1, 0)));
+ mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
+ mipedgescale = _mm_div_ps(_mm_set1_ps(1.0f), _mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
{
texture = dpsoftrast.texbound[j];
if (texture)
{
+ __m128 mipedgetc;
if (texture->filter <= DPSOFTRAST_TEXTURE_FILTER_LINEAR)
{
mip[j] = 0;
continue;
}
k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
- mip_edge0tc[0] = (clipped[k][0][0] - clipped[k][1][0]) * texture->mipmap[0][2];
- mip_edge0tc[1] = (clipped[k][0][1] - clipped[k][1][1]) * texture->mipmap[0][3];
- mip_edge1tc[0] = (clipped[k][2][0] - clipped[k][1][0]) * texture->mipmap[0][2];
- mip_edge1tc[1] = (clipped[k][2][1] - clipped[k][1][1]) * texture->mipmap[0][3];
- mip_edge0mip = (mip_edge0tc[0]*mip_edge0tc[0]+mip_edge0tc[1]*mip_edge0tc[1]) * mip_edge0xymul;
- mip_edge1mip = (mip_edge1tc[0]*mip_edge1tc[0]+mip_edge1tc[1]*mip_edge1tc[1]) * mip_edge1xymul;
+ mipedgetc = _mm_sub_ps(_mm_shuffle_ps(clipped[k][0], clipped[k][2], _MM_SHUFFLE(1, 0, 1, 0)),
+ _mm_shuffle_ps(clipped[k][1], clipped[k][1], _MM_SHUFFLE(1, 0, 1, 0)));
+ mipedgetc = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
+ mipedgetc = _mm_mul_ps(mipedgetc, mipedgetc);
+ mipedgetc = _mm_add_ps(mipedgetc, _mm_shuffle_ps(mipedgetc, mipedgetc, _MM_SHUFFLE(2, 3, 0, 1)));
+ mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
+ mipedgetc = _mm_min_ss(mipedgetc, _mm_shuffle_ps(mipedgetc, mipedgetc, _MM_SHUFFLE(2, 2, 2, 2)));
// this will be multiplied in the texturing routine by the texture resolution
- mipdensity = mip_edge0mip < mip_edge1mip ? mip_edge0mip : mip_edge1mip;
- y = (int)(log(mipdensity)/log(2.0f));
- if (y < 0)
- y = 0;
- if (y > texture->mipmaps - 1)
- y = texture->mipmaps - 1;
+ y = _mm_cvtss_si32(mipedgetc);
+ if (y > 0)
+ {
+ y = (int)(log(y)/M_LN2);
+ if (y > texture->mipmaps - 1)
+ y = texture->mipmaps - 1;
+ }
+ else y = 0;
mip[j] = y;
}
}
// iterate potential spans
// TODO: optimize? if we figured out the edge order beforehand, this
- // could do loops over the edges in the proper order rather than
- // selecting them for each span
+ // could do loops over the edges in the proper order rather than
+ // selecting them for each span
// TODO: optimize? the edges could have data slopes calculated
// TODO: optimize? the data slopes could be calculated as a plane
- // (2D slopes) to avoid any interpolation along edges at all
- for (y = extent[1];y < extent[3];y++)
+ // (2D slopes) to avoid any interpolation along edges at all
+ for (y = starty+1;y < endy;)
{
- // get center of pixel y
- yc = y;
- // do the compares all at once
- screenyless[0] = y <= screeny[0];
- screenyless[1] = y <= screeny[1];
- screenyless[2] = y <= screeny[2];
- screenyless[3] = y <= screeny[3];
+ int nexty = -1;
+ __m128 edge0offset, edge1offset, edge0scale, edge1scale, data[DPSOFTRAST_ARRAY_TOTAL+1][2], slope[DPSOFTRAST_ARRAY_TOTAL+1][2];
+ __m128i screenycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
+ int screenymask = _mm_movemask_epi8(screenycc);
if (numpoints == 4)
{
- switch(screenyless[0] + screenyless[1] * 2 + screenyless[2] * 4 + screenyless[3] * 8)
+ switch(screenymask)
{
- case 0: /*0000*/ continue;
- case 1: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 0;edge1n = 1;break;
- case 2: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 1;edge1n = 2;break;
- case 3: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 2;break;
- case 4: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 2;edge1n = 3;break;
- case 5: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 2;edge1n = 3;break; // concave - nonsense
- case 6: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 3;break;
- case 7: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 3;break;
- case 8: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 3;edge1n = 0;break;
- case 9: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 1;break;
- case 10: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 2;break; // concave - nonsense
- case 11: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 2;break;
- case 12: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 0;break;
- case 13: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 1;break;
- case 14: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 0;break;
- case 15: /*1111*/ continue;
+ default:
+ case 0xFFFF: /*0000*/ y++; continue;
+ case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
+ case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
+ case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
+ case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
+ case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
+ case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
+ case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
+ case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
+ case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
+ case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 2;break; // concave - nonsense
+ case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
+ case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
+ case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
+ case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
+ case 0x0000: /*1111*/ y++; continue;
}
}
else
{
- switch(screenyless[0] + screenyless[1] * 2 + screenyless[2] * 4)
+ switch(screenymask)
{
- case 0: /*000*/ continue;
- case 1: /*100*/ edge0p = 2;edge0n = 0;edge1p = 0;edge1n = 1;break;
- case 2: /*010*/ edge0p = 0;edge0n = 1;edge1p = 1;edge1n = 2;break;
- case 3: /*110*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 2;break;
- case 4: /*001*/ edge0p = 1;edge0n = 2;edge1p = 2;edge1n = 0;break;
- case 5: /*101*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 1;break;
- case 6: /*011*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 0;break;
- case 7: /*111*/ continue;
+ default:
+ case 0xFFFF: /*000*/ y++; continue;
+ case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
+ case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
+ case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
+ case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
+ case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
+ case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
+ case 0x0000: /*111*/ y++; continue;
}
}
-#if 0
- {
- int foundedges = 0;
- int cedge0p = 0;
- int cedge0n = 0;
- int cedge1p = 0;
- int cedge1n = 0;
- for (j = 0, k = numpoints-1;j < numpoints;k = j, j++)
+ screenycc = _mm_max_epi16(_mm_srli_epi16(screenycc, 1), screeny);
+ screenycc = _mm_min_epi16(screenycc, _mm_shuffle_epi32(screenycc, _MM_SHUFFLE(1, 0, 3, 2)));
+ screenycc = _mm_min_epi16(screenycc, _mm_shuffle_epi32(screenycc, _MM_SHUFFLE(2, 3, 0, 1)));
+ nexty = _mm_extract_epi16(screenycc, 0);
+ if(nexty >= endy) nexty = endy-1;
+ if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
{
- if (screenyless[k] && !screenyless[j])
- {
- cedge1p = k;
- cedge1n = j;
- foundedges |= 1;
- }
- else if (screenyless[j] && !screenyless[k])
+ int tmp = edge0n;
+ edge0n = edge1n;
+ edge1n = tmp;
+ tmp = edge0p;
+ edge0p = edge1p;
+ edge1p = tmp;
+ }
+ edge0offset = _mm_shuffle_ps(screen[edge0p], screen[edge0p], _MM_SHUFFLE(1, 1, 1, 1));
+ edge0scale = _mm_div_ss(_mm_set1_ps(1.0f), _mm_sub_ss(_mm_shuffle_ps(screen[edge0n], screen[edge0n], _MM_SHUFFLE(1, 1, 1, 1)), edge0offset));
+ edge0scale = _mm_shuffle_ps(edge0scale, edge0scale, _MM_SHUFFLE(0, 0, 0, 0));
+ edge0offset = _mm_sub_ps(_mm_set1_ps(y), edge0offset);
+ edge1offset = _mm_shuffle_ps(screen[edge1p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1));
+ edge1scale = _mm_div_ss(_mm_set1_ps(1.0f), _mm_sub_ss(_mm_shuffle_ps(screen[edge1n], screen[edge1n], _MM_SHUFFLE(1, 1, 1, 1)), edge1offset));
+ edge1offset = _mm_sub_ps(_mm_set1_ps(y), edge1offset);
+ edge1scale = _mm_shuffle_ps(edge1scale, edge1scale, _MM_SHUFFLE(0, 0, 0, 0));
+ j = DPSOFTRAST_ARRAY_TOTAL;
+ slope[j][0] = _mm_mul_ps(_mm_sub_ps(screen[edge0n], screen[edge0p]), edge0scale);
+ slope[j][1] = _mm_mul_ps(_mm_sub_ps(screen[edge1n], screen[edge1p]), edge1scale);
+ data[j][0] = _mm_add_ps(_mm_mul_ps(slope[j][0], edge0offset), screen[edge0p]);
+ data[j][1] = _mm_add_ps(_mm_mul_ps(slope[j][1], edge1offset), screen[edge1p]);
+ data[j][1] = _mm_sub_ps(data[j][1], data[j][0]);
+ slope[j][1] = _mm_sub_ps(slope[j][1], slope[j][0]);
+ for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
+ {
+ //if (arraymask[j])
{
- cedge0p = k;
- cedge0n = j;
- foundedges |= 2;
+ slope[j][0] = _mm_mul_ps(_mm_sub_ps(proj[j][edge0n], proj[j][edge0p]), edge0scale);
+ slope[j][1] = _mm_mul_ps(_mm_sub_ps(proj[j][edge1n], proj[j][edge1p]), edge1scale);
+ data[j][0] = _mm_add_ps(_mm_mul_ps(slope[j][0], edge0offset), proj[j][edge0p]);
+ data[j][1] = _mm_add_ps(_mm_mul_ps(slope[j][1], edge1offset), proj[j][edge1p]);
+ data[j][1] = _mm_sub_ps(data[j][1], data[j][0]);
+ slope[j][1] = _mm_sub_ps(slope[j][1], slope[j][0]);
}
}
- if (foundedges != 3)
- continue;
- if (cedge0p != edge0p || cedge0n != edge0n || cedge1p != edge1p || cedge1n != edge1n)
+ goto firstspan;
+ for(; y <= nexty; y++)
{
- if (numpoints == 4)
- printf("case %i%i%i%i is broken %i %i %i %i != %i %i %i %i\n", screenyless[0], screenyless[1], screenyless[2], screenyless[3], cedge0p, cedge0n, cedge1p, cedge1n, edge0p, edge0n, edge1p, edge1n);
- else
- printf("case %i%i%i is broken %i %i %i %i != %i %i %i %i\n", screenyless[0], screenyless[1], screenyless[2], cedge0p, cedge0n, cedge1p, cedge1n, edge0p, edge0n, edge1p, edge1n);
- }
- }
-#endif
- edge0ylerp = (yc - screen[edge0p][1]) / (screen[edge0n][1] - screen[edge0p][1]);
- edge1ylerp = (yc - screen[edge1p][1]) / (screen[edge1n][1] - screen[edge1p][1]);
- if (edge0ylerp < 0 || edge0ylerp > 1 || edge1ylerp < 0 || edge1ylerp > 1)
- continue;
- edge0yilerp = 1.0f - edge0ylerp;
- edge1yilerp = 1.0f - edge1ylerp;
- edge0xf = screen[edge0p][0] * edge0yilerp + screen[edge0n][0] * edge0ylerp;
- edge1xf = screen[edge1p][0] * edge1yilerp + screen[edge1n][0] * edge1ylerp;
- if (edge0xf < edge1xf)
- {
- startxf = edge0xf;
- endxf = edge1xf;
- }
- else
- {
- startxf = edge1xf;
- endxf = edge0xf;
- }
- startx = (int)ceil(startxf);
- endx = (int)ceil(endxf);
- if (startx < 0)
- startx = 0;
- if (endx > width)
- endx = width;
- if (startx >= endx)
- continue;
- if (startxf > startx || endxf < endx-1) { printf("%s:%i X wrong (%i to %i is outside %f to %f)\n", __FILE__, __LINE__, startx, endx, startxf, endxf); }
- spanilength = 1.0f / (endxf - startxf);
- startxlerp = startx - startxf;
- span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
- memcpy(span->mip, mip, sizeof(span->mip));
- span->start = y * width + startx;
- span->length = endx - startx;
- j = DPSOFTRAST_ARRAY_TOTAL;
- if (edge0xf < edge1xf)
- {
- span->data[0][j][0] = screen[edge0p][0] * edge0yilerp + screen[edge0n][0] * edge0ylerp;
- span->data[0][j][1] = screen[edge0p][1] * edge0yilerp + screen[edge0n][1] * edge0ylerp;
- span->data[0][j][2] = screen[edge0p][2] * edge0yilerp + screen[edge0n][2] * edge0ylerp;
- span->data[0][j][3] = screen[edge0p][3] * edge0yilerp + screen[edge0n][3] * edge0ylerp;
- span->data[1][j][0] = screen[edge1p][0] * edge1yilerp + screen[edge1n][0] * edge1ylerp;
- span->data[1][j][1] = screen[edge1p][1] * edge1yilerp + screen[edge1n][1] * edge1ylerp;
- span->data[1][j][2] = screen[edge1p][2] * edge1yilerp + screen[edge1n][2] * edge1ylerp;
- span->data[1][j][3] = screen[edge1p][3] * edge1yilerp + screen[edge1n][3] * edge1ylerp;
+ __m128 data0, data1, spanilength, startxlerp;
+ j = DPSOFTRAST_ARRAY_TOTAL;
+ data[j][0] = _mm_add_ps(data[j][0], slope[j][0]);
+ data[j][1] = _mm_add_ps(data[j][1], slope[j][1]);
for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
{
//if (arraymask[j])
{
- span->data[0][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
- span->data[0][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
- span->data[0][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
- span->data[0][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
- span->data[1][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
- span->data[1][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
- span->data[1][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
- span->data[1][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
+ data[j][0] = _mm_add_ps(data[j][0], slope[j][0]);
+ data[j][1] = _mm_add_ps(data[j][1], slope[j][1]);
}
}
- }
- else
- {
- span->data[0][j][0] = screen[edge1p][0] * edge1yilerp + screen[edge1n][0] * edge1ylerp;
- span->data[0][j][1] = screen[edge1p][1] * edge1yilerp + screen[edge1n][1] * edge1ylerp;
- span->data[0][j][2] = screen[edge1p][2] * edge1yilerp + screen[edge1n][2] * edge1ylerp;
- span->data[0][j][3] = screen[edge1p][3] * edge1yilerp + screen[edge1n][3] * edge1ylerp;
- span->data[1][j][0] = screen[edge0p][0] * edge0yilerp + screen[edge0n][0] * edge0ylerp;
- span->data[1][j][1] = screen[edge0p][1] * edge0yilerp + screen[edge0n][1] * edge0ylerp;
- span->data[1][j][2] = screen[edge0p][2] * edge0yilerp + screen[edge0n][2] * edge0ylerp;
- span->data[1][j][3] = screen[edge0p][3] * edge0yilerp + screen[edge0n][3] * edge0ylerp;
+
+ firstspan:
+ startx = _mm_cvtss_si32(_mm_add_ss(data[DPSOFTRAST_ARRAY_TOTAL][0], _mm_set1_ps(0.5f)));
+ endx = _mm_cvtss_si32(_mm_add_ss(_mm_add_ss(data[DPSOFTRAST_ARRAY_TOTAL][0], data[DPSOFTRAST_ARRAY_TOTAL][1]), _mm_set1_ps(0.5f)));
+ if (startx < 0) startx = 0;
+ if (endx > width) endx = width;
+ if (startx >= endx) continue;
+#if 0
+ _mm_store_ss(&startxf, data0);
+ _mm_store_ss(&endxf, data1);
+ if (startxf > startx || endxf < endx-1) { printf("%s:%i X wrong (%i to %i is outside %f to %f)\n", __FILE__, __LINE__, startx, endx, startxf, endxf); }
+#endif
+ spanilength = _mm_div_ss(_mm_set1_ps(1.0f), data[DPSOFTRAST_ARRAY_TOTAL][1]);
+ spanilength = _mm_shuffle_ps(spanilength, spanilength, _MM_SHUFFLE(0, 0, 0, 0));
+ startxlerp = _mm_sub_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), startx), data[DPSOFTRAST_ARRAY_TOTAL][0]);
+ startxlerp = _mm_shuffle_ps(startxlerp, startxlerp, _MM_SHUFFLE(0, 0, 0, 0));
+ span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
+ memcpy(span->mip, mip, sizeof(span->mip));
+ span->start = y * width + startx;
+ span->length = endx - startx;
+ j = DPSOFTRAST_ARRAY_TOTAL;
+ data1 = _mm_mul_ps(data[j][1], spanilength);
+ data0 = _mm_add_ps(data[j][0], _mm_mul_ps(data1, startxlerp));
+ _mm_store_ps(span->data[0][j], data0);
+ _mm_store_ps(span->data[1][j], data1);
for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
{
//if (arraymask[j])
{
- span->data[0][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
- span->data[0][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
- span->data[0][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
- span->data[0][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
- span->data[1][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
- span->data[1][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
- span->data[1][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
- span->data[1][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
+ data1 = _mm_mul_ps(data[j][1], spanilength);
+ data0 = _mm_add_ps(data[j][0], _mm_mul_ps(data1, startxlerp));
+ _mm_store_ps(span->data[0][j], data0);
+ _mm_store_ps(span->data[1][j], data1);
}
}
- }
- // change data[1][n][] to be a data slope
- j = DPSOFTRAST_ARRAY_TOTAL;
- span->data[1][j][0] = (span->data[1][j][0] - span->data[0][j][0]) * spanilength;
- span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
- span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
- span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
- for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
- {
- //if (arraymask[j])
+ // to keep the shader routines from needing more than a small
+ // buffer for pixel intermediate data, we split long spans...
+ while (span->length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
{
- span->data[1][j][0] = (span->data[1][j][0] - span->data[0][j][0]) * spanilength;
- span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
- span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
- span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
- }
- }
- // adjust the data[0][n][] to be correct for the pixel centers
- // this also handles horizontal clipping where a major part of the
- // span may be off the left side of the screen
- j = DPSOFTRAST_ARRAY_TOTAL;
- span->data[0][j][0] += span->data[1][j][0] * startxlerp;
- span->data[0][j][1] += span->data[1][j][1] * startxlerp;
- span->data[0][j][2] += span->data[1][j][2] * startxlerp;
- span->data[0][j][3] += span->data[1][j][3] * startxlerp;
- for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
- {
- //if (arraymask[j])
- {
- span->data[0][j][0] += span->data[1][j][0] * startxlerp;
- span->data[0][j][1] += span->data[1][j][1] * startxlerp;
- span->data[0][j][2] += span->data[1][j][2] * startxlerp;
- span->data[0][j][3] += span->data[1][j][3] * startxlerp;
+ span->length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
+ if (dpsoftrast.draw.numspans >= DPSOFTRAST_DRAW_MAXSPANQUEUE)
+ {
+ DPSOFTRAST_Draw_ProcessSpans();
+ dpsoftrast.draw.numspans = 0;
+ }
+ oldspan = span;
+ span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
+ *span = *oldspan;
+ startx += DPSOFTRAST_DRAW_MAXSPANLENGTH;
+ span->start = y * width + startx;
+ span->length = endx - startx;
+ j = DPSOFTRAST_ARRAY_TOTAL;
+ _mm_store_ps(span->data[0][j], _mm_add_ps(_mm_load_ps(span->data[0][j]), _mm_mul_ps(_mm_load_ps(span->data[1][j]), _mm_set1_ps(DPSOFTRAST_DRAW_MAXSPANLENGTH))));
+ for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
+ {
+ //if (arraymask[j])
+ {
+ _mm_store_ps(span->data[0][j], _mm_add_ps(_mm_load_ps(span->data[0][j]), _mm_mul_ps(_mm_load_ps(span->data[1][j]), _mm_set1_ps(DPSOFTRAST_DRAW_MAXSPANLENGTH))));
+ }
+ }
}
- }
- // to keep the shader routines from needing more than a small
- // buffer for pixel intermediate data, we split long spans...
- while (span->length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
- {
- span->length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
+ // after all that, we have a span suitable for the pixel shader...
if (dpsoftrast.draw.numspans >= DPSOFTRAST_DRAW_MAXSPANQUEUE)
{
DPSOFTRAST_Draw_ProcessSpans();
dpsoftrast.draw.numspans = 0;
}
- oldspan = span;
- span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
- *span = *oldspan;
- startx += DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->start = y * width + startx;
- span->length = endx - startx;
- j = DPSOFTRAST_ARRAY_TOTAL;
- span->data[0][j][0] += span->data[1][j][0] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
- {
- //if (arraymask[j])
- {
- span->data[0][j][0] += span->data[1][j][0] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
- }
- }
- }
- // after all that, we have a span suitable for the pixel shader...
- if (dpsoftrast.draw.numspans >= DPSOFTRAST_DRAW_MAXSPANQUEUE)
- {
- DPSOFTRAST_Draw_ProcessSpans();
- dpsoftrast.draw.numspans = 0;
}
}
// draw outlines over triangle for debugging
DPSOFTRAST_Draw_ProcessSpans();
dpsoftrast.draw.numspans = 0;
}
+#endif
}
void DPSOFTRAST_Draw_DebugPoints(void)
int i;
for (i = 0;i < dpsoftrast.texture_end;i++)
if (dpsoftrast.texture[i].bytes)
- free(dpsoftrast.texture[i].bytes);
+ MM_FREE(dpsoftrast.texture[i].bytes);
if (dpsoftrast.texture)
free(dpsoftrast.texture);
memset(&dpsoftrast, 0, sizeof(dpsoftrast));