]> git.xonotic.org Git - xonotic/darkplaces.git/blobdiff - dpsoftrast.c
FinishBGRA8 optimization and fixes
[xonotic/darkplaces.git] / dpsoftrast.c
index 98841a62ecfdf41bb2afeeb6191f3f48334a9f00..3b0ec6e02c127a39a5e74da98b8ea70068105d36 100644 (file)
@@ -188,11 +188,14 @@ typedef ALIGN(struct DPSOFTRAST_State_Span_s
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
+       int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
+       int depthslope; // depthbuffer value pixel delta
 }
 DPSOFTRAST_State_Span);
 
 #define DPSOFTRAST_DRAW_MAXSPANS 1024
 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
+#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
 
 #define DPSOFTRAST_VALIDATE_FB 1
 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
@@ -235,6 +238,8 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];
+       float clipplane[4];
+       ALIGN(float fb_clipplane[4]);
 
        int shader_mode;
        int shader_permutation;
@@ -278,6 +283,7 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
+       unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
 }
 DPSOFTRAST_State_Thread);
 
@@ -345,7 +351,9 @@ DPSOFTRAST_State dpsoftrast;
 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
-#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
+
+static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
+static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
 
 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
 {
@@ -375,6 +383,15 @@ static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
        }
 }
 
+static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread) // recomputes thread->fb_clipplane from thread->clipplane using the thread's fb_viewportscale and fb_viewportcenter
+{
+       thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1]; // plane components 0..2 are divided by viewport scale entries 1..3
+       thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
+       thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
+       thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0]; // component 3 pairs with scale entry 0 (NOTE(review): presumably the viewport arrays are ordered w,x,y,z - confirm against DPSOFTRAST_RecalcViewport)
+       thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3]; // subtracts the dot product of the viewport center (same 1,2,3,0 pairing) with the scaled plane; the last term reads fb_clipplane[3] as assigned on the previous line
+}
+
 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
 {
        // calculate framebuffer scissor, viewport, viewport clipped by scissor,
@@ -396,6 +413,7 @@ static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
        thread->fb_scissor[3] = y2 - y1;
 
        DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
+       DPSOFTRAST_RecalcClipPlane(thread);
        DPSOFTRAST_RecalcThread(thread);
 }
 
@@ -1435,6 +1453,21 @@ void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
        dpsoftrast.uniform1i[command->index] = i0;
 }
 
+DEFCOMMAND(24, ClipPlane, float clipplane[4];) // NOTE(review): DEFCOMMAND is defined outside this view; presumably it declares DPSOFTRAST_Command_ClipPlane with opcode 24 and this payload field - confirm
+static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command) // applies a ClipPlane command to one thread's state
+{
+       memcpy(thread->clipplane, command->clipplane, 4*sizeof(float)); // copy the four plane coefficients from the command into the thread state
+       thread->validate |= DPSOFTRAST_VALIDATE_FB; // mark FB state as needing validation; DPSOFTRAST_RecalcFB calls DPSOFTRAST_RecalcClipPlane (NOTE(review): presumably this flag is what triggers RecalcFB - confirm)
+}
+void DPSOFTRAST_ClipPlane(float x, float y, float z, float w) // obtains a ClipPlane command from DPSOFTRAST_ALLOCATECOMMAND and stores (x, y, z, w) in it
+{
+       DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane); // NOTE(review): allocator macro is defined outside this view; the result is used without a NULL check - confirm it cannot fail
+       command->clipplane[0] = x; // coefficients are stored in x, y, z, w order, matching the 4-float memcpy in DPSOFTRAST_Interpret_ClipPlane
+       command->clipplane[1] = y;
+       command->clipplane[2] = z;
+       command->clipplane[3] = w;
+}
+
 #ifdef SSE_POSSIBLE
 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
 {
@@ -1989,6 +2022,13 @@ void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAS
        float wslope = triangle->w[0];
        float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
        float endz = 1.0f / (w + wslope * startx);
+       if (triangle->w[0] == 0)
+       {
+               // LordHavoc: fast path for flat polygons (HUD/menu): w has no x slope, so 1/w (endz) is the same for every pixel of the span
+               for (x = startx;x < endx;x++)
+                       zf[x] = endz;
+               return;
+       }
        for (x = startx;x < endx;)
        {
                int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
@@ -2001,323 +2041,237 @@ void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAS
        }
 }
 
-void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
+void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
 {
+#ifdef SSE_POSSIBLE
        int x;
        int startx = span->startx;
        int endx = span->endx;
-       int d[4];
-       float a, b;
+       int subx;
+       const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
+       unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
        if (!pixel)
                return;
        pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
+       pixeli += span->y * dpsoftrast.fb_width + span->x;
        // handle alphatest now (this affects depth writes too)
        if (thread->alphatest)
                for (x = startx;x < endx;x++)
-                       if (in4f[x*4+3] < 0.5f)
+                       if (in4ub[x*4+3] < 128)
                                pixelmask[x] = false;
-       // FIXME: this does not handle bigendian
+       // LordHavoc: for the alpha-weighted blend modes (ALPHA, ADDALPHA, SUBALPHA)
+       // mask off pixels whose alpha is 0; this helps sprites, text and hud artwork
        switch(thread->fb_blendmode)
        {
-       case DPSOFTRAST_BLENDMODE_OPAQUE:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_ALPHA:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       a = in4f[x*4+3] * 255.0f;
-                       b = 1.0f - in4f[x*4+3];
-                       d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_ADDALPHA:
+       case DPSOFTRAST_BLENDMODE_SUBALPHA:
                for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       a = in4f[x*4+3] * 255.0f;
-                       d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
+                       if (in4ub[x*4+3] < 1)
+                               pixelmask[x] = false;
                break;
+       case DPSOFTRAST_BLENDMODE_OPAQUE:
        case DPSOFTRAST_BLENDMODE_ADD:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_INVMOD:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_MUL:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_MUL2:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
-       case DPSOFTRAST_BLENDMODE_SUBALPHA:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       a = in4f[x*4+3] * -255.0f;
-                       d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
-                       d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
-                       d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
-                       d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       a = 255.0f;
-                       b = 1.0f - in4f[x*4+3];
-                       d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
-               break;
        case DPSOFTRAST_BLENDMODE_INVADD:
-               for (x = startx;x < endx;x++)
-               {
-                       if (!pixelmask[x])
-                               continue;
-                       d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
-                       d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
-                       d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
-                       d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
-                       pixel[x*4+0] = d[0];
-                       pixel[x*4+1] = d[1];
-                       pixel[x*4+2] = d[2];
-                       pixel[x*4+3] = d[3];
-               }
                break;
        }
-}
-
-void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
-{
-#ifdef SSE_POSSIBLE
-       int x;
-       int startx = span->startx;
-       int endx = span->endx;
-       const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
-       unsigned char * RESTRICT pixelmask = span->pixelmask;
-       unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
-       unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
-       if (!pixel)
-               return;
-       pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
-       pixeli += span->y * dpsoftrast.fb_width + span->x;
-       // handle alphatest now (this affects depth writes too)
-       if (thread->alphatest)
-               for (x = startx;x < endx;x++)
-                       if (in4ub[x*4+3] < 0.5f)
-                               pixelmask[x] = false;
-       // FIXME: this does not handle bigendian
-       switch(thread->fb_blendmode)
+       // write sentinel bytes just past the span (set at endx, clear at endx+1) so the masked and unmasked scan loops below always terminate
+       pixelmask[endx] = 1;
+       pixelmask[endx+1] = 0;
+       // LordHavoc: use a double loop to identify subspans (runs of consecutive
+       // unmasked pixels) so the optimized copy/blend loops can process whole runs;
+       // most triangles have only one run, and the scans use 4-byte reads of the mask.
+       x = startx;
+       while (x < endx)
        {
-       case DPSOFTRAST_BLENDMODE_OPAQUE:
-               for (x = startx;x + 4 <= endx;)
+               // if this pixel is masked off, it's probably not alone...
+               if (!pixelmask[x])
                {
-                       if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
+                       x++;
+#if 1
+                       if (x + 8 < endx)
                        {
-                               _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
-                               x += 4;
+                               // the 4-item search must be aligned or else it stalls badly
+                               if ((x & 3) && !pixelmask[x]) 
+                               {
+                                       if(pixelmask[x]) goto endmasked;
+                                       x++;
+                                       if (x & 3)
+                                       {
+                                               if(pixelmask[x]) goto endmasked;
+                                               x++;
+                                               if (x & 3)
+                                               {
+                                                       if(pixelmask[x]) goto endmasked;
+                                                       x++;
+                                               }
+                                       }
+                               }
+                               while (*(unsigned int *)&pixelmask[x] == 0x00000000)
+                                       x += 4;
+                       }
+#endif
+                       for (;!pixelmask[x];x++)
+                               ;
+                       // rather than continue the loop, just check the end variable
+                       if (x >= endx)
+                               break;
+               }
+       endmasked:
+               // find length of subspan
+               subx = x + 1;
+#if 1
+               if (subx + 8 < endx)
+               {
+                       if (subx & 3)
+                       {
+                               if(!pixelmask[subx]) goto endunmasked;
+                               subx++;
+                               if (subx & 3)
+                               {
+                                       if(!pixelmask[subx]) goto endunmasked;
+                                       subx++;
+                                       if (subx & 3)
+                                       {
+                                               if(!pixelmask[subx]) goto endunmasked;
+                                               subx++;
+                                       }
+                               }
+                       }
+                       while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
+                               subx += 4;
+               }
+#endif
+               for (;pixelmask[subx];subx++)
+                       ;
+               // the checks can overshoot, so make sure to clip it...
+               if (subx > endx)
+                       subx = endx;
+       endunmasked:
+               // now that we know the subspan length...  process!
+               switch(thread->fb_blendmode)
+               {
+               case DPSOFTRAST_BLENDMODE_OPAQUE:
+#if 0
+                       if (subx - x >= 16)
+                       {
+                               memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
+                               x = subx;
                        }
                        else
+#elif 1
+                       while (x + 16 <= subx)
+                       {
+                               _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+                               _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
+                               _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
+                               _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
+                               x += 16;
+                       }
+#endif
                        {
-                               if (pixelmask[x])
+                               while (x + 4 <= subx)
+                               {
+                                       _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
+                                       x += 4;
+                               }
+                               if (x + 2 <= subx)
+                               {
+                                       pixeli[x] = ini[x];
+                                       pixeli[x+1] = ini[x+1];
+                                       x += 2;
+                               }
+                               if (x < subx)
+                               {
                                        pixeli[x] = ini[x];
-                               x++;
+                                       x++;
+                               }
                        }
-               }
-               for (;x < endx;x++)
-                       if (pixelmask[x])
-                               pixeli[x] = ini[x];
-               break;
-       case DPSOFTRAST_BLENDMODE_ALPHA:
-       #define FINISHBLEND(blend2, blend1) \
-               for (x = startx;x + 1 < endx;x += 2) \
-               { \
-                       __m128i src, dst; \
-                       switch (*(const unsigned short*)&pixelmask[x]) \
+                       break;
+               case DPSOFTRAST_BLENDMODE_ALPHA:
+               #define FINISHBLEND(blend2, blend1) \
+                       for (;x + 1 < subx;x += 2) \
                        { \
-                       case 0x0101: \
+                               __m128i src, dst; \
                                src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
                                dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
                                blend2; \
                                _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
-                               continue; \
-                       case 0x0100: \
-                               src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
-                               dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
-                               blend1; \
-                               pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
-                               continue; \
-                       case 0x0001: \
+                       } \
+                       if (x < subx) \
+                       { \
+                               __m128i src, dst; \
                                src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
                                dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
                                blend1; \
                                pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
-                               continue; \
-                       } \
-                       break; \
-               } \
-               for(;x < endx; x++) \
-               { \
-                       __m128i src, dst; \
-                       if (!pixelmask[x]) \
-                               continue; \
-                       src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
-                       dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
-                       blend1; \
-                       pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
+                               x++; \
+                       }
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_ADDALPHA:
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_ADD:
+                       FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
+                       break;
+               case DPSOFTRAST_BLENDMODE_INVMOD:
+                       FINISHBLEND({
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+                       }, {
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_MUL:
+                       FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
+                       break;
+               case DPSOFTRAST_BLENDMODE_MUL2:
+                       FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
+                       break;
+               case DPSOFTRAST_BLENDMODE_SUBALPHA:
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
+                       FINISHBLEND({
+                               __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+                       }, {
+                               __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
+                               dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
+                       });
+                       break;
+               case DPSOFTRAST_BLENDMODE_INVADD:
+                       FINISHBLEND({
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+                       }, {
+                               dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+                       });
+                       break;
                }
-
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_ADDALPHA:
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_ADD:
-               FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
-               break;
-       case DPSOFTRAST_BLENDMODE_INVMOD:
-               FINISHBLEND({
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
-               }, {
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_MUL:
-               FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
-               break;
-       case DPSOFTRAST_BLENDMODE_MUL2:
-               FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
-               break;
-       case DPSOFTRAST_BLENDMODE_SUBALPHA:
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
-               FINISHBLEND({
-                       __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
-               }, {
-                       __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
-                       dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
-               });
-               break;
-       case DPSOFTRAST_BLENDMODE_INVADD:
-               FINISHBLEND({
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
-               }, {
-                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
-               });
-               break;
        }
 #endif
 }
@@ -2386,8 +2340,13 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
        tcimax[1] = texture->mipmap[mip][3]-1;
        tciwrapmask[0] = texture->mipmap[mip][2]-1;
        tciwrapmask[1] = texture->mipmap[mip][3]-1;
-       endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
-       endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
+       endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
+       endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
+       if (filter)
+       {
+               endtc[0] -= 0.5f;
+               endtc[1] -= 0.5f;
+       }
        for (x = startx;x < endx;)
        {
                unsigned int subtc[2];
@@ -2401,8 +2360,13 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                }
                tc[0] = endtc[0];
                tc[1] = endtc[1];
-               endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
-               endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
+               endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
+               endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
+               if (filter)
+               {
+                       endtc[0] -= 0.5f;
+                       endtc[1] -= 0.5f;
+               }
                substep[0] = (endtc[0] - tc[0]) * subscale;
                substep[1] = (endtc[1] - tc[1]) * subscale;
                subtc[0] = tc[0] * (1<<12);
@@ -2522,6 +2486,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        __m128i subtc, substep, endsubtc;
        int filter;
        int mip;
+       int affine; // LordHavoc: optimized affine texturing case
        unsigned int * RESTRICT outi = (unsigned int *)out4ub;
        const unsigned char * RESTRICT pixelbase;
        DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
@@ -2541,6 +2506,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
                        outi[x] = k;
                return;
        }
+       affine = zf[startx] == zf[endx-1];
        filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
        DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
        flags = texture->flags;
@@ -2549,7 +2515,9 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        tcscale = _mm_cvtepi32_ps(tcsize);
        data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
        slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
-       endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
+       endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+       if (filter)
+               endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
        endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
        tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
        tcmax = _mm_packs_epi32(tcmask, tcmask);
@@ -2557,14 +2525,16 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        {
                int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
-               if (nextsub >= endx)
+               if (nextsub >= endx || affine)
                {
                        nextsub = endsub = endx-1;
                        if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
                }       
                tc = endtc;
                subtc = endsubtc;
-               endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
+               endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+               if (filter)
+                       endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
                substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
                endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
                subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
@@ -4439,7 +4409,7 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
 
                // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
                iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
-        
+               
                // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
                SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
                SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
@@ -4475,7 +4445,7 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
                }
                else
                {
-                       int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5f, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5f };
+                       int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
                        tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
                        tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
                        pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
@@ -4595,81 +4565,89 @@ static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COU
        {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
 };
 
-void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
+static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
 {
-       int i;
        int x;
        int startx;
        int endx;
-//     unsigned int c;
-//     unsigned int *colorpixel;
        unsigned int *depthpixel;
-       float w;
-       float wslope;
        int depth;
        int depthslope;
        unsigned int d;
+       unsigned char *pixelmask;
+       DPSOFTRAST_State_Triangle *triangle;
+       triangle = &thread->triangles[span->triangle];
+       depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
+       startx = span->startx;
+       endx = span->endx;
+       depth = span->depthbase;
+       depthslope = span->depthslope;
+       pixelmask = thread->pixelmaskarray;
+       if (thread->depthtest && dpsoftrast.fb_depthpixels)
+       {
+               switch(thread->fb_depthfunc)
+               {
+               default:
+               case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
+               case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
+               case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
+               case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
+               case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
+               case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
+               case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
+               }
+               while (startx < endx && !pixelmask[startx])
+                       startx++;
+               while (endx > startx && !pixelmask[endx-1])
+                       endx--;
+       }
+       else
+       {
+               // no depth testing means we're just dealing with color...
+               memset(pixelmask + startx, 1, endx - startx);
+       }
+       span->pixelmask = pixelmask;
+       span->startx = startx;
+       span->endx = endx;
+}
+
+static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
+{
+       int x, d, depth, depthslope, startx, endx;
+       const unsigned char *pixelmask;
+       unsigned int *depthpixel;
+       if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
+       {
+               depth = span->depthbase;
+               depthslope = span->depthslope;
+               pixelmask = span->pixelmask;
+               startx = span->startx;
+               endx = span->endx;
+               depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
+               for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
+                       if (pixelmask[x])
+                               depthpixel[x] = d;
+       }
+}
+
+void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
+{
+       int i;
        DPSOFTRAST_State_Triangle *triangle;
        DPSOFTRAST_State_Span *span;
-       unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        for (i = 0; i < thread->numspans; i++)
        {
                span = &thread->spans[i];
                triangle = &thread->triangles[span->triangle];
-               if (thread->depthtest && dpsoftrast.fb_depthpixels)
-               {
-                       wslope = triangle->w[0];
-                       w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
-                       depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
-                       depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
-                       depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
-                       startx = span->startx;
-                       endx = span->endx;
-                       switch(thread->fb_depthfunc)
-                       {
-                       default:
-                       case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
-                       case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
-                       case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
-                       case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
-                       case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
-                       case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
-                       case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
-                       }
-                       //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
-                       //for (x = startx;x < endx;x++)
-                       //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
-                       // if there is no color buffer, skip pixel shader
-                       while (startx < endx && !pixelmask[startx])
-                               startx++;
-                       while (endx > startx && !pixelmask[endx-1])
-                               endx--;
-                       if (startx >= endx)
-                               continue; // no pixels to fill
-                       span->pixelmask = pixelmask;
-                       span->startx = startx;
-                       span->endx = endx;
-                       // run pixel shader if appropriate
-                       // do this before running depthmask code, to allow the pixelshader
-                       // to clear pixelmask values for alpha testing
-                       if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
-                               DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
-                       if (thread->depthmask)
-                               for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
-                                       if (pixelmask[x])
-                                               depthpixel[x] = d;
-               }
-               else
-               {
-                       // no depth testing means we're just dealing with color...
-                       // if there is no color buffer, skip pixel shader
-                       if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
-                       {
-                               memset(pixelmask + span->startx, 1, span->endx - span->startx);
-                               span->pixelmask = pixelmask;
-                               DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
-                       }
-               }
+               DPSOFTRAST_Draw_DepthTest(thread, span);
+               if (span->startx >= span->endx)
+                       continue;
+               // run pixel shader if appropriate
+               // do this before running depthmask code, to allow the pixelshader
+               // to clear pixelmask values for alpha testing
+               if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
+                       DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
+               DPSOFTRAST_Draw_DepthWrite(thread, span);
        }
        thread->numspans = 0;
 }
@@ -4700,6 +4678,8 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
        int numpoints;
        int clipcase;
        float clipdist[4];
+       float clip0origin, clip0slope;
+       int clip0dir;
        __m128 triangleedge1, triangleedge2, trianglenormal;
        __m128 clipfrac[3];
        __m128 screen[4];
@@ -4924,6 +4904,43 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                        _mm_store_ss(&triangle->w[0], attribxslope);
                        _mm_store_ss(&triangle->w[1], attribyslope);
                        _mm_store_ss(&triangle->w[2], attriborigin);
+                       
+                       clip0origin = 0;
+                       clip0slope = 0;
+                       clip0dir = 0;
+                       if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
+                       {
+                               float cliporigin, clipxslope, clipyslope;
+                               attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
+                               attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
+                               attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
+                               attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
+                               attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
+                               attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
+                               cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
+                               clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
+                               clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
+                               if(clipxslope != 0)
+                               {
+                                       clip0origin = -cliporigin/clipxslope;
+                                       clip0slope = -clipyslope/clipxslope;
+                                       clip0dir = clipxslope > 0 ? 1 : -1;
+                               }
+                               else if(clipyslope > 0)
+                               {
+                                       clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
+                                       clip0slope = dpsoftrast.fb_width;
+                                       clip0dir = -1;
+                               }
+                               else if(clipyslope < 0)
+                               {
+                                       clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
+                                       clip0slope = -dpsoftrast.fb_width;
+                                       clip0dir = -1;
+                               }
+                               else if(clip0origin < 0) continue;
+                       }
+
                        mipedgescale = _mm_setzero_ps();
                        for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
                        {
@@ -4985,6 +5002,8 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                        int yccmask = _mm_movemask_epi8(ycc);
                        int edge0p, edge0n, edge1p, edge1n;
                        int nexty;
+                       float w, wslope;
+                       float clip0;
                        if (numpoints == 4)
                        {
                                switch(yccmask)
@@ -5038,28 +5057,47 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                                xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
                                xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
                        }
-                       for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
+                       clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
+                       for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
                        {
                                int startx, endx, offset;
                                startx = _mm_cvtss_si32(xcoords);
                                endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
-                               if (startx < minx) 
-                               {
-                                       if (startx < 0) startx = 0;
-                                       startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
-                               }
+                               if (startx < minx) startx = minx;
                                if (endx > maxx) endx = maxx;
                                if (startx >= endx) continue;
+
+                               if (clip0dir)
+                               {
+                                       if (clip0dir > 0)
+                                       {
+                                               if (startx < clip0) 
+                                               {
+                                                       if(endx <= clip0) continue;
+                                                       startx = (int)clip0;
+                                               }
+                                       }
+                                       else if (endx > clip0) 
+                                       {
+                                               if(startx >= clip0) continue;
+                                               endx = (int)clip0;
+                                       }
+                               }
+                                               
                                for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
                                {
                                        DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
                                        span->triangle = thread->numtriangles;
                                        span->x = offset;
                                        span->y = y;
-                                       span->startx = max(minx - offset, 0);
+                                       span->startx = 0;
                                        span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                        if (span->startx >= span->endx)
-                                               continue; 
+                                               continue;
+                                       wslope = triangle->w[0];
+                                       w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
+                                       span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
+                                       span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
                                        if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
                                                DPSOFTRAST_Draw_ProcessSpans(thread);
                                }
@@ -5246,6 +5284,7 @@ static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, i
                INTERPCOMMAND(UniformMatrix4f)
                INTERPCOMMAND(Uniform1i)
                INTERPCOMMAND(SetRenderTargets)
+               INTERPCOMMAND(ClipPlane)
 
                case DPSOFTRAST_OPCODE_Draw:
                        DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
@@ -5384,6 +5423,7 @@ int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsign
                DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
                thread->index = i;
                thread->cullface = GL_BACK;
+               thread->colormask[0] = 1; 
                thread->colormask[1] = 1;
                thread->colormask[2] = 1;
                thread->colormask[3] = 1;
@@ -5408,8 +5448,10 @@ int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsign
                thread->depthrange[1] = 1;
                thread->polygonoffset[0] = 0;
                thread->polygonoffset[1] = 0;
-       
-               DPSOFTRAST_RecalcThread(thread);
+               thread->clipplane[0] = 0;
+               thread->clipplane[1] = 0;
+               thread->clipplane[2] = 0;
+               thread->clipplane[3] = 1;
        
                thread->numspans = 0;
                thread->numtriangles = 0;