]> git.xonotic.org Git - xonotic/darkplaces.git/blobdiff - dpsoftrast.c
enabled more features for SHADERMODE_GENERIC
[xonotic/darkplaces.git] / dpsoftrast.c
index cbd6629b573659a53e3a7a78e7e1402db2facd42..b1a7ca68bef72b984f489bf98a6c7e88f594cf0f 100644 (file)
 typedef enum bool {false, true} bool;
 #endif
 
+#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1400) // GCC-compatible compilers, or MSVC with _MSC_VER >= 1400
+#define RESTRICT __restrict // pointer qualifier promising the compiler that the pointed-to data is not aliased
+#else
+#define RESTRICT // expands to nothing, so RESTRICT-qualified declarations still compile on other compilers
+#endif
+
 #define GL_NONE                                        0
 #define GL_FRONT_LEFT                  0x0400
 #define GL_FRONT_RIGHT                 0x0401
@@ -78,9 +84,6 @@ typedef struct DPSOFTRAST_Texture_s
        int size;
        unsigned char *bytes;
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
-       int clampmin[3];
-       int clampmax[3];
-       int wrapmask[3];
 }
 DPSOFTRAST_Texture;
 
@@ -105,12 +108,15 @@ typedef struct DPSOFTRAST_State_User_s
 }
 DPSOFTRAST_State_User;
 
+#define DPSOFTRAST_MAXSUBSPAN 16 // max pixels per sub-span: 1/w and texcoords are evaluated at the sub-span ends and stepped linearly in between
+
 typedef struct DPSOFTRAST_State_Draw_Span_s
 {
        int start; // pixel index
        int length; // pixel count
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
+       unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        // [0][n][] is start interpolant values (projected)
        // [1][n][] is end interpolant values (projected)
@@ -154,6 +160,7 @@ typedef enum DPSOFTRAST_BLENDMODE_e
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
+       DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_TOTAL
 }
 DPSOFTRAST_BLENDMODE;
@@ -280,6 +287,7 @@ int blendmodetable[][4] =
        {DPSOFTRAST_BLENDMODE_MUL, GL_ZERO, GL_SRC_COLOR, false},
        {DPSOFTRAST_BLENDMODE_MUL, GL_DST_COLOR, GL_ZERO, false},
        {DPSOFTRAST_BLENDMODE_MUL2, GL_DST_COLOR, GL_SRC_COLOR, false},
+       {DPSOFTRAST_BLENDMODE_PSEUDOALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA, false},
        {DPSOFTRAST_BLENDMODE_SUBALPHA, GL_SRC_COLOR, GL_ONE, true}
 };
 
@@ -451,15 +459,6 @@ int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
 
        // allocate the pixels now
        texture->bytes = calloc(1, size);
-       texture->clampmin[0] = 0;
-       texture->clampmin[1] = 0;
-       texture->clampmin[2] = 0;
-       texture->clampmax[0] = texture->width-1;
-       texture->clampmax[1] = texture->height-1;
-       texture->clampmax[2] = texture->depth-1;
-       texture->wrapmask[0] = texture->width-1;
-       texture->wrapmask[1] = texture->height-1;
-       texture->wrapmask[2] = texture->depth-1;
 
        return texnum;
 }
@@ -499,18 +498,18 @@ void DPSOFTRAST_Texture_CalculateMipmaps(int index)
                                row1 = y*2+1;
                                if (row1 >= texture->mipmap[i-1][3])
                                        row1 = texture->mipmap[i-1][3]-1;
-                               o =  texture->bytes + texture->mipmap[i  ][0] + 4*(texture->mipmap[i  ][3] * texture->mipmap[i  ][2] * z      + texture->mipmap[i  ][2] * y   );
-                               i0 = texture->bytes + texture->mipmap[i-1][0] + 4*(texture->mipmap[i-1][3] * texture->mipmap[i-1][2] * layer0 + texture->mipmap[i-1][2] * row0);
-                               i1 = texture->bytes + texture->mipmap[i-1][0] + 4*(texture->mipmap[i-1][3] * texture->mipmap[i-1][2] * layer0 + texture->mipmap[i-1][2] * row1);
-                               i2 = texture->bytes + texture->mipmap[i-1][0] + 4*(texture->mipmap[i-1][3] * texture->mipmap[i-1][2] * layer1 + texture->mipmap[i-1][2] * row0);
-                               i3 = texture->bytes + texture->mipmap[i-1][0] + 4*(texture->mipmap[i-1][3] * texture->mipmap[i-1][2] * layer1 + texture->mipmap[i-1][2] * row1);
+                               o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
+                               i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
+                               i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
+                               i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
+                               i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
                                w = texture->mipmap[i][2];
                                if (layer1 > layer0)
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 3D texture
-                                               for (x = 0;x < w;x++)
+                                               for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
@@ -521,7 +520,7 @@ void DPSOFTRAST_Texture_CalculateMipmaps(int index)
                                        else
                                        {
                                                // average 3D mipmap with parent width == 1
-                                               for (x = 0;x < w;x++)
+                                               for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
                                                        o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
@@ -535,7 +534,7 @@ void DPSOFTRAST_Texture_CalculateMipmaps(int index)
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 2D texture (common case)
-                                               for (x = 0;x < w;x++)
+                                               for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
@@ -546,13 +545,10 @@ void DPSOFTRAST_Texture_CalculateMipmaps(int index)
                                        else
                                        {
                                                // 2D texture with parent width == 1
-                                               for (x = 0;x < w;x++)
-                                               {
-                                                       o[0] = (i0[0] + i1[0] + 1) >> 1;
-                                                       o[1] = (i0[1] + i1[1] + 1) >> 1;
-                                                       o[2] = (i0[2] + i1[2] + 1) >> 1;
-                                                       o[3] = (i0[3] + i1[3] + 1) >> 1;
-                                               }
+                                               o[0] = (i0[0] + i1[0] + 1) >> 1;
+                                               o[1] = (i0[1] + i1[1] + 1) >> 1;
+                                               o[2] = (i0[2] + i1[2] + 1) >> 1;
+                                               o[3] = (i0[3] + i1[3] + 1) >> 1;
                                        }
                                }
                        }
@@ -945,7 +941,7 @@ void DPSOFTRAST_Uniform1iARB(DPSOFTRAST_UNIFORM index, int i0)
        dpsoftrast.uniform1i[index] = i0;
 }
 
-void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors, int numtexcoords)
+void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
 {
        int i;
        int j;
@@ -1029,7 +1025,7 @@ void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcol
                        }
                }
        }
-       for (j = 0;j < numtexcoords;j++)
+       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
        {
                if (dpsoftrast.pointer_texcoordf[j])
                {
@@ -1078,10 +1074,17 @@ void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcol
 
 void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
 {
+       static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        // TODO: SIMD
        float matrix[4][4];
        int i;
        memcpy(matrix, inmatrix16f, sizeof(float[16]));
+       if (!memcmp(identitymatrix, matrix, sizeof(float[16])))
+       {
+               // fast case for identity matrix
+               memcpy(out4f, in4f, numitems * sizeof(float[4]));
+               return;
+       }
        for (i = 0;i < numitems;i++, out4f += 4, in4f += 4)
        {
                out4f[0] = in4f[0] * matrix[0][0] + in4f[1] * matrix[1][0] + in4f[2] * matrix[2][0] + in4f[3] * matrix[3][0];
@@ -1169,6 +1172,10 @@ void DPSOFTRAST_Draw_DebugEdgePoints(const float *screen0, const float *screen1)
        }
 }
 
+void DPSOFTRAST_Draw_VertexShaderLightDirection(void) // NOTE(review): presumably a placeholder for a light-direction vertex shader - confirm
+{ // empty body: this function currently does nothing
+}
+
 void DPSOFTRAST_Draw_Span_Begin(const DPSOFTRAST_State_Draw_Span *span, float *zf)
 {
        int x;
@@ -1176,20 +1183,33 @@ void DPSOFTRAST_Draw_Span_Begin(const DPSOFTRAST_State_Draw_Span *span, float *z
        int endx = span->endx;
        float w = span->data[0][DPSOFTRAST_ARRAY_TOTAL][3];
        float wslope = span->data[1][DPSOFTRAST_ARRAY_TOTAL][3];
-       // TODO: optimize by approximating every 8 pixels?
-       for (x = startx;x < endx;x++)
-               zf[x] = 1.0f / (w + wslope * x);
+       for (x = startx;x < endx;)
+       {
+               int endsub = x + DPSOFTRAST_MAXSUBSPAN-1;
+               float z = 1.0f / (w + wslope * x), dz;
+               if (endsub >= endx)
+               {
+                       endsub = endx-1;
+                       dz = endsub > x ? (1.0f / (w + wslope * endsub) - z) / (endsub - x) : 0.0f;
+               }
+               else
+               {
+                       dz = (1.0f / (w + wslope * endsub) - z) * (1.0f / (DPSOFTRAST_MAXSUBSPAN-1));
+               }
+               for (; x <= endsub; x++, z += dz)
+                       zf[x] = z;
+       }
 }
 
-void DPSOFTRAST_Draw_Span_Finish(const DPSOFTRAST_State_Draw_Span *span, const float *in4f)
+void DPSOFTRAST_Draw_Span_Finish(const DPSOFTRAST_State_Draw_Span *span, const float * RESTRICT in4f)
 {
        int x;
        int startx = span->startx;
        int endx = span->endx;
        int d[4];
        float a, b;
-       unsigned char *pixelmask = span->pixelmask;
-       unsigned char *pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
+       unsigned char * RESTRICT pixelmask = span->pixelmask;
+       unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
        if (!pixel)
                return;
        pixel += span->start * 4;
@@ -1325,10 +1345,27 @@ void DPSOFTRAST_Draw_Span_Finish(const DPSOFTRAST_State_Draw_Span *span, const f
                        pixel[x*4+3] = d[3];
                }
                break;
+       case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
+               for (x = startx;x < endx;x++)
+               {
+                       if (!pixelmask[x])
+                               continue;
+                       a = 255.0f;
+                       b = 1.0f - in4f[x*4+3];
+                       d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
+                       d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
+                       d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
+                       d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
+                       pixel[x*4+0] = d[0];
+                       pixel[x*4+1] = d[1];
+                       pixel[x*4+2] = d[2];
+                       pixel[x*4+3] = d[3];
+               }
+               break;
        }
 }
 
-void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *span, float *out4f, int texunitindex, int arrayindex, const float *zf)
+void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
 {
        int x;
        int startx = span->startx;
@@ -1337,21 +1374,18 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *spa
        float c[4];
        float data[4];
        float slope[4];
-       float z;
        float tc[2];
-       float frac[2];
-       float ifrac[2];
-       float lerp[4];
        float tcscale[2];
-       int tci[2];
-       int tci1[2];
-       int tcimin[2];
-       int tcimax[2];
+       unsigned int tci[2];
+       unsigned int tci1[2];
+       unsigned int tcimin[2];
+       unsigned int tcimax[2];
        int tciwrapmask[2];
        int tciwidth;
        int filter;
-       unsigned char *pixelbase;
-       unsigned char *pixel[4];
+       int mip;
+       const unsigned char * RESTRICT pixelbase;
+       const unsigned char * RESTRICT pixel[4];
        DPSOFTRAST_Texture *texture = dpsoftrast.texbound[texunitindex];
        // if no texture is bound, just fill it with white
        if (!texture)
@@ -1365,8 +1399,9 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *spa
                }
                return;
        }
-       // if texture is 1 pixel, just fill it with that color
-       if (texture->width * texture->height * texture->depth * texture->sides == 1)
+       mip = span->mip[texunitindex];
+       // if this mipmap of the texture is 1 pixel, just fill it with that color
+       if (texture->mipmap[mip][1] == 4)
        {
                c[0] = texture->bytes[2] * (1.0f/255.0f);
                c[1] = texture->bytes[1] * (1.0f/255.0f);
@@ -1381,7 +1416,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *spa
                }
                return;
        }
-       filter = dpsoftrast.texbound[texunitindex]->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
+       filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
        data[0] = span->data[0][arrayindex][0];
        data[1] = span->data[0][arrayindex][1];
        data[2] = span->data[0][arrayindex][2];
@@ -1390,101 +1425,134 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *spa
        slope[1] = span->data[1][arrayindex][1];
        slope[2] = span->data[1][arrayindex][2];
        slope[3] = span->data[1][arrayindex][3];
-       flags = dpsoftrast.texbound[texunitindex]->flags;
-       pixelbase = (unsigned char *)dpsoftrast.texbound[texunitindex]->bytes;
-       tcscale[0] = dpsoftrast.texbound[texunitindex]->width;
-       tcscale[1] = dpsoftrast.texbound[texunitindex]->height;
-       tciwidth = dpsoftrast.texbound[texunitindex]->width;
-       tcimin[0] = dpsoftrast.texbound[texunitindex]->clampmin[0];
-       tcimin[1] = dpsoftrast.texbound[texunitindex]->clampmin[1];
-       tcimax[0] = dpsoftrast.texbound[texunitindex]->clampmax[0];
-       tcimax[1] = dpsoftrast.texbound[texunitindex]->clampmax[1];
-       tciwrapmask[0] = dpsoftrast.texbound[texunitindex]->wrapmask[0];
-       tciwrapmask[1] = dpsoftrast.texbound[texunitindex]->wrapmask[1];
-       if (filter)
+       flags = texture->flags;
+       pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
+       tcscale[0] = texture->mipmap[mip][2];
+       tcscale[1] = texture->mipmap[mip][3];
+       tciwidth = texture->mipmap[mip][2];
+       tcimin[0] = 0;
+       tcimin[1] = 0;
+       tcimax[0] = texture->mipmap[mip][2]-1;
+       tcimax[1] = texture->mipmap[mip][3]-1;
+       tciwrapmask[0] = texture->mipmap[mip][2]-1;
+       tciwrapmask[1] = texture->mipmap[mip][3]-1;
+       for (x = startx;x < endx;)
        {
-               if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+               float endtc[2];
+               unsigned int subtc[2];
+               unsigned int substep[2];
+               int endsub = x + DPSOFTRAST_MAXSUBSPAN-1;
+               float subscale = 4096.0f/(DPSOFTRAST_MAXSUBSPAN-1);
+               if (endsub >= endx)
                {
-                       for (x = startx;x < endx;x++)
-                       {
-                               z = zf[x];
-                               tc[0] = (data[0] + slope[0]*x) * z * tcscale[0];
-                               tc[1] = (data[1] + slope[1]*x) * z * tcscale[1];
-                               tci[0] = (int)floor(tc[0]);
-                               tci[1] = (int)floor(tc[1]);
-                               tci1[0] = tci[0] + 1;
-                               tci1[1] = tci[1] + 1;
-                               frac[0] = tc[0] - tci[0];ifrac[0] = 1.0f - frac[0];
-                               frac[1] = tc[1] - tci[1];ifrac[1] = 1.0f - frac[1];
-                               lerp[0] = ifrac[0]*ifrac[1];
-                               lerp[1] =  frac[0]*ifrac[1];
-                               lerp[2] = ifrac[0]* frac[1];
-                               lerp[3] =  frac[0]* frac[1];
-                               tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
-                               tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
-                               tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
-                               tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
-                               pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
-                               pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
-                               pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
-                               pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
-                               c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 255.0f);
-                               c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 255.0f);
-                               c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 255.0f);
-                               c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 255.0f);
-                               out4f[x*4+0] = c[0];
-                               out4f[x*4+1] = c[1];
-                               out4f[x*4+2] = c[2];
-                               out4f[x*4+3] = c[3];
-                       }
+                       endsub = endx-1;
+                       subscale = endsub > x ? 4096.0f / (endsub - x) : 1.0f;
                }
-               else
+               tc[0] = (data[0] + slope[0]*x) * zf[x] * tcscale[0] - 0.5f;
+               tc[1] = (data[1] + slope[1]*x) * zf[x] * tcscale[1] - 0.5f;
+               endtc[0] = (data[0] + slope[0]*endsub) * zf[endsub] * tcscale[0] - 0.5f;
+               endtc[1] = (data[1] + slope[1]*endsub) * zf[endsub] * tcscale[1] - 0.5f;
+               substep[0] = (endtc[0] - tc[0]) * subscale;
+               substep[1] = (endtc[1] - tc[1]) * subscale;
+               subtc[0] = tc[0] * (1<<12);
+               subtc[1] = tc[1] * (1<<12);
+               if (!(flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE))
+               {
+                       subtc[0] &= (tciwrapmask[0]<<12)|0xFFF;
+                       subtc[1] &= (tciwrapmask[1]<<12)|0xFFF;
+               }
+               if(filter)
                {
-                       for (x = startx;x < endx;x++)
+                       tci[0] = (subtc[0]>>12) - tcimin[0];
+                       tci[1] = (subtc[1]>>12) - tcimin[0];
+                       tci1[0] = ((subtc[0] + (endsub - x)*substep[0])>>12) + 1;
+                       tci1[1] = ((subtc[1] + (endsub - x)*substep[1])>>12) + 1;
+                       if (tci[0] <= tcimax[0] && tci[1] <= tcimax[1] && tci1[0] <= tcimax[0] && tci1[1] <= tcimax[1])
                        {
-                               z = zf[x];
-                               tc[0] = (data[0] + slope[0]*x) * z * tcscale[0];
-                               tc[1] = (data[1] + slope[1]*x) * z * tcscale[1];
-                               tci[0] = (int)floor(tc[0]);
-                               tci[1] = (int)floor(tc[1]);
-                               tci1[0] = tci[0] + 1;
-                               tci1[1] = tci[1] + 1;
-                               frac[0] = tc[0] - tci[0];ifrac[0] = 1.0f - frac[0];
-                               frac[1] = tc[1] - tci[1];ifrac[1] = 1.0f - frac[1];
-                               lerp[0] = ifrac[0]*ifrac[1];
-                               lerp[1] =  frac[0]*ifrac[1];
-                               lerp[2] = ifrac[0]* frac[1];
-                               lerp[3] =  frac[0]* frac[1];
-                               tci[0] &= tciwrapmask[0];
-                               tci[1] &= tciwrapmask[1];
-                               tci1[0] &= tciwrapmask[0];
-                               tci1[1] &= tciwrapmask[1];
-                               pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
-                               pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
-                               pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
-                               pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
-                               c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 255.0f);
-                               c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 255.0f);
-                               c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 255.0f);
-                               c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 255.0f);
-                               out4f[x*4+0] = c[0];
-                               out4f[x*4+1] = c[1];
-                               out4f[x*4+2] = c[2];
-                               out4f[x*4+3] = c[3];
+                               for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+                               {
+                                       unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
+                                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
+                                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
+                                       tci[0] = subtc[0]>>12;
+                                       tci[1] = subtc[1]>>12;
+                                       pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
+                                       pixel[1] = pixel[0] + 4 * tciwidth;
+                                       c[0] = (pixel[0][2]*lerp[0]+pixel[0][4+2]*lerp[1]+pixel[1][2]*lerp[2]+pixel[1][4+2]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[1] = (pixel[0][1]*lerp[0]+pixel[0][4+1]*lerp[1]+pixel[1][1]*lerp[2]+pixel[1][4+1]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[2] = (pixel[0][0]*lerp[0]+pixel[0][4+0]*lerp[1]+pixel[1][0]*lerp[2]+pixel[1][4+0]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[3] = (pixel[0][3]*lerp[0]+pixel[0][4+3]*lerp[1]+pixel[1][3]*lerp[2]+pixel[1][4+3]*lerp[3]) * (1.0f / 0xFF000000);
+                                       out4f[x*4+0] = c[0];
+                                       out4f[x*4+1] = c[1];
+                                       out4f[x*4+2] = c[2];
+                                       out4f[x*4+3] = c[3];
+                               }
+                       }
+                       else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+                       {
+                               for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+                               {
+                                       unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
+                                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
+                                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
+                                       tci[0] = subtc[0]>>12;
+                                       tci[1] = subtc[1]>>12;
+                                       tci1[0] = tci[0] + 1;
+                                       tci1[1] = tci[1] + 1;
+                                       tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
+                                       tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
+                                       tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
+                                       tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
+                                       pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
+                                       pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
+                                       pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
+                                       pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
+                                       c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
+                                       out4f[x*4+0] = c[0];
+                                       out4f[x*4+1] = c[1];
+                                       out4f[x*4+2] = c[2];
+                                       out4f[x*4+3] = c[3];
+                               }
+                       }
+                       else
+                       {
+                               for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+                               {
+                                       unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
+                                       unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
+                                       unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
+                                       tci[0] = subtc[0]>>12;
+                                       tci[1] = subtc[1]>>12;
+                                       tci1[0] = tci[0] + 1;
+                                       tci1[1] = tci[1] + 1;
+                                       tci[0] &= tciwrapmask[0];
+                                       tci[1] &= tciwrapmask[1];
+                                       tci1[0] &= tciwrapmask[0];
+                                       tci1[1] &= tciwrapmask[1];
+                                       pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
+                                       pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
+                                       pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
+                                       pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
+                                       c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
+                                       c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
+                                       out4f[x*4+0] = c[0];
+                                       out4f[x*4+1] = c[1];
+                                       out4f[x*4+2] = c[2];
+                                       out4f[x*4+3] = c[3];
+                               }
                        }
                }
-       }
-       else
-       {
-               if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+               else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
                {
-                       for (x = startx;x < endx;x++)
+                       for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
                        {
-                               z = zf[x];
-                               tc[0] = (data[0] + slope[0]*x) * z * tcscale[0];
-                               tc[1] = (data[1] + slope[1]*x) * z * tcscale[1];
-                               tci[0] = (int)floor(tc[0]);
-                               tci[1] = (int)floor(tc[1]);
+                               tci[0] = subtc[0]>>12;
+                               tci[1] = subtc[1]>>12;
                                tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
                                tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
                                pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
@@ -1500,13 +1568,10 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(const DPSOFTRAST_State_Draw_Span *spa
                }
                else
                {
-                       for (x = startx;x < endx;x++)
+                       for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
                        {
-                               z = zf[x];
-                               tc[0] = (data[0] + slope[0]*x) * z * tcscale[0];
-                               tc[1] = (data[1] + slope[1]*x) * z * tcscale[1];
-                               tci[0] = (int)floor(tc[0]);
-                               tci[1] = (int)floor(tc[1]);
+                               tci[0] = subtc[0]>>12;
+                               tci[1] = subtc[1]>>12;
                                tci[0] &= tciwrapmask[0];
                                tci[1] &= tciwrapmask[1];
                                pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
@@ -1554,6 +1619,37 @@ void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Draw_Span *span
        }
 }
 
+// writes one interpolated vertex array (selected by arrayindex) into out4f
+// for every pixel in [span->startx, span->endx); span->data[0] holds the
+// start values and span->data[1] the per-pixel slope, and zf supplies the
+// per-pixel factor each interpolated value is multiplied by
+void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Draw_Span *span, float *out4f, int arrayindex, const float *zf)
+{
+       int x;
+       int ch;
+       int first = span->startx;
+       int last = span->endx;
+       float base[4]; // interpolant value at x = 0
+       float step[4]; // interpolant change per pixel
+       float z;
+       // cache both interpolant rows so the pixel loop only touches locals
+       for (ch = 0;ch < 4;ch++)
+       {
+               base[ch] = span->data[0][arrayindex][ch];
+               step[ch] = span->data[1][arrayindex][ch];
+       }
+       for (x = first;x < last;x++)
+       {
+               // z is read once per pixel and shared by all four channels
+               z = zf[x];
+               for (ch = 0;ch < 4;ch++)
+               {
+                       // linear in x, then scaled by z
+                       out4f[x*4+ch] = (base[ch] + step[ch]*x) * z;
+               }
+       }
+}
+
 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
 {
        int x, startx = span->startx, endx = span->endx;
@@ -1575,6 +1671,45 @@ void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Draw_Span *span, float
        }
 }
 
+// out4f = ina4f * inb4f, channel by channel, for pixels [span->startx, span->endx)
+void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *ina4f, const float *inb4f)
+{
+       int i;
+       int first = span->startx * 4;
+       int last = span->endx * 4;
+       // four floats per pixel, so the span is one flat run of floats;
+       // each output reads only its own index, so out4f may be one of the inputs
+       for (i = first;i < last;i++)
+               out4f[i] = ina4f[i] * inb4f[i];
+}
+
+// out4f = ina4f + inb4f, channel by channel, for pixels [span->startx, span->endx)
+void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *ina4f, const float *inb4f)
+{
+       int i;
+       int first = span->startx * 4;
+       int last = span->endx * 4;
+       // four floats per pixel, so the span is one flat run of floats;
+       // each output reads only its own index, so out4f may be one of the inputs
+       for (i = first;i < last;i++)
+               out4f[i] = ina4f[i] + inb4f[i];
+}
+
+// out4f = ina4f * (1 - alpha) + inb4f * alpha per channel, alpha taken from inb4f
+void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *ina4f, const float *inb4f)
+{
+       int x, ch, startx = span->startx, endx = span->endx;
+       float keep, take;
+       for (x = startx;x < endx;x++)
+       {
+               // both weights are latched before any channel of this pixel is written
+               take = inb4f[x*4+3];
+               keep = 1.0f - take;
+               for (ch = 0;ch < 4;ch++)
+                       out4f[x*4+ch] = ina4f[x*4+ch] * keep + inb4f[x*4+ch] * take;
+       }
+}
+
 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *in4f, const float *color)
 {
        int x, startx = span->startx, endx = span->endx;
@@ -1594,7 +1729,7 @@ void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Draw_Span *span
        }
 }
 
-void DPSOFTRAST_Draw_Span_Lightmap(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *diffuse, const float *lightmap)
+void DPSOFTRAST_Draw_Span_Lightmap(const DPSOFTRAST_State_Draw_Span *span, float * RESTRICT out4f, const float * RESTRICT diffuse, const float * RESTRICT lightmap)
 {
        int x, startx = span->startx, endx = span->endx;
        float Color_Ambient[4], Color_Diffuse[4];
@@ -1615,6 +1750,39 @@ void DPSOFTRAST_Draw_Span_Lightmap(const DPSOFTRAST_State_Draw_Span *span, float
        }
 }
 
+// shades diffuse by ambient + lightmap*diffusecolor and stores bytes (capped at 255) directly into fb_colorpixels[0]
+void DPSOFTRAST_Draw_Span_Lightmap_Finish(const DPSOFTRAST_State_Draw_Span *span, const float * RESTRICT diffuse, const float * RESTRICT lightmap)
+{
+       int x, ch, v, rgba[4], startx = span->startx, endx = span->endx;
+       float ambient[4], scale[4];
+       unsigned char * RESTRICT pixelmask = span->pixelmask;
+       unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
+       if (!pixel)
+               return;
+       pixel += span->start * 4;
+       for (ch = 0;ch < 3;ch++)
+       {
+               ambient[ch] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+ch]*255.0f;
+               scale[ch] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+ch]*255.0f;
+       }
+       ambient[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f;
+       scale[3] = 0.0f;
+       for (x = startx;x < endx;x++)
+       {
+               if (!pixelmask[x])
+                       continue;
+               for (ch = 0;ch < 4;ch++)
+               {
+                       v = diffuse[x*4+ch] * (ambient[ch] + lightmap[x*4+ch] * scale[ch]);
+                       rgba[ch] = v > 255 ? 255 : v;
+               }
+               pixel[x*4+0] = rgba[2]; // channels 0 and 2 are swapped on store
+               pixel[x*4+1] = rgba[1];
+               pixel[x*4+2] = rgba[0];
+               pixel[x*4+3] = rgba[3];
+       }
+}
+
 void DPSOFTRAST_Draw_Span_VertexColor(const DPSOFTRAST_State_Draw_Span *span, float *out4f, const float *diffuse, const float *zf)
 {
        int x, startx = span->startx, endx = span->endx;
@@ -1679,6 +1847,8 @@ void DPSOFTRAST_Draw_VertexShader(void)
        case SHADERMODE_GENERIC: ///< (particles/HUD/etc) vertex color: optionally multiplied by one texture
                DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.numvertices);
                DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
+               if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
+                       DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.draw.numvertices);
                break;
        case SHADERMODE_POSTPROCESS: ///< postprocessing shader (r_glsl_postprocess)
                DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
@@ -1687,14 +1857,14 @@ void DPSOFTRAST_Draw_VertexShader(void)
        case SHADERMODE_DEPTH_OR_SHADOW: ///< (depthfirst/shadows) vertex shader only
                break;
        case SHADERMODE_FLATCOLOR: ///< (lightmap) modulate texture by uniform color (q1bsp: q3bsp)
-               DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
+               DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
                break;
        case SHADERMODE_VERTEXCOLOR: ///< (lightmap) modulate texture by vertex colors (q3bsp)
                DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.draw.numvertices);
-               DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
+               DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
                break;
        case SHADERMODE_LIGHTMAP: ///< (lightmap) modulate texture by lightmap texture (q1bsp: q3bsp)
-               DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices);
+               DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
                DPSOFTRAST_Array_Copy(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.draw.numvertices);
                break;
        case SHADERMODE_FAKELIGHT: ///< (fakelight) modulate texture by "fake" lighting (no lightmaps: no nothing)
@@ -1704,6 +1874,8 @@ void DPSOFTRAST_Draw_VertexShader(void)
        case SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE: ///< (lightmap) use directional pixel shading from texture containing tangentspace light directions (q1bsp deluxemap)
                break;
        case SHADERMODE_LIGHTDIRECTION: ///< (lightmap) use directional pixel shading from fixed light direction (q3bsp)
+               DPSOFTRAST_Array_Transform(dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.draw.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
+               DPSOFTRAST_Draw_VertexShaderLightDirection();
                break;
        case SHADERMODE_LIGHTSOURCE: ///< (lightsource) use directional pixel shading from light source (rtlight)
                break;
@@ -1732,8 +1904,32 @@ void DPSOFTRAST_Draw_PixelShaderSpan(const DPSOFTRAST_State_Draw_Span *span)
        {
        case SHADERMODE_GENERIC: ///< (particles/HUD/etc) vertex color: optionally multiplied by one texture
                DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
-               DPSOFTRAST_Draw_Span_Texture2DVarying(span, buffer_texture_color, GL20TU_FIRST, 2, buffer_z);
-               DPSOFTRAST_Draw_Span_MultiplyVarying(span, buffer_FragColor, buffer_texture_color, 1, buffer_z);
+               if (dpsoftrast.shader_permutation & SHADERPERMUTATION_DIFFUSE)
+               {
+                       DPSOFTRAST_Draw_Span_Texture2DVarying(span, buffer_texture_color, GL20TU_FIRST, 2, buffer_z);
+                       DPSOFTRAST_Draw_Span_MultiplyVarying(span, buffer_FragColor, buffer_texture_color, 1, buffer_z);
+                       if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR) // combine with GL20TU_SECOND; at most one mode below applies
+                       {
+                               DPSOFTRAST_Draw_Span_Texture2DVarying(span, buffer_texture_lightmap, GL20TU_SECOND, 2, buffer_z);
+                               if (dpsoftrast.shader_permutation & SHADERPERMUTATION_COLORMAPPING)
+                               {
+                                       // multiply: modulate the result so far by the second texture
+                                       DPSOFTRAST_Draw_Span_MultiplyBuffers(span, buffer_FragColor, buffer_FragColor, buffer_texture_lightmap);
+                               }
+                               else if (dpsoftrast.shader_permutation & SHADERPERMUTATION_GLOW)
+                               {
+                                       // add: second texture is summed in (testing COLORMAPPING again here would make this branch unreachable)
+                                       DPSOFTRAST_Draw_Span_AddBuffers(span, buffer_FragColor, buffer_FragColor, buffer_texture_lightmap);
+                               }
+                               else if (dpsoftrast.shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
+                               {
+                                       // alphablend: mix toward the second texture by its own alpha
+                                       DPSOFTRAST_Draw_Span_MixBuffers(span, buffer_FragColor, buffer_FragColor, buffer_texture_lightmap);
+                               }
+                       }
+               }
+               else
+                       DPSOFTRAST_Draw_Span_Varying(span, buffer_FragColor, 1, buffer_z);
                DPSOFTRAST_Draw_Span_Finish(span, buffer_FragColor);
                break;
        case SHADERMODE_POSTPROCESS: ///< postprocessing shader (r_glsl_postprocess)
@@ -1774,8 +1970,15 @@ void DPSOFTRAST_Draw_PixelShaderSpan(const DPSOFTRAST_State_Draw_Span *span)
                DPSOFTRAST_Draw_Span_Begin(span, buffer_z);
                DPSOFTRAST_Draw_Span_Texture2DVarying(span, buffer_texture_color, GL20TU_COLOR, 2, buffer_z);
                DPSOFTRAST_Draw_Span_Texture2DVarying(span, buffer_texture_lightmap, GL20TU_LIGHTMAP, 6, buffer_z);
-               DPSOFTRAST_Draw_Span_Lightmap(span, buffer_FragColor, buffer_texture_color, buffer_texture_lightmap);
-               DPSOFTRAST_Draw_Span_Finish(span, buffer_FragColor);
+               if(!dpsoftrast.user.alphatest && dpsoftrast.fb_blendmode == DPSOFTRAST_BLENDMODE_OPAQUE)
+               {
+                       DPSOFTRAST_Draw_Span_Lightmap_Finish(span, buffer_texture_color, buffer_texture_lightmap);
+               }
+               else
+               {
+                       DPSOFTRAST_Draw_Span_Lightmap(span, buffer_FragColor, buffer_texture_color, buffer_texture_lightmap);
+                       DPSOFTRAST_Draw_Span_Finish(span, buffer_FragColor);
+               }
                break;
        case SHADERMODE_FAKELIGHT: ///< (fakelight) modulate texture by "fake" lighting (no lightmaps: no nothing)
                break;
@@ -1880,7 +2083,7 @@ void DPSOFTRAST_Draw_ProcessSpans(void)
        }
 }
 
-void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s, int numarrays)
+void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask)
 {
        int cullface = dpsoftrast.user.cullface;
        int width = dpsoftrast.fb_width;
@@ -1899,9 +2102,19 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
        int edge0n;
        int edge1p;
        int edge1n;
-       int extent[4];
+       int extent[6];
        int startx;
        int endx;
+       float mip_edge0tc[2];
+       float mip_edge1tc[2];
+       float mip_edge0xy[2];
+       float mip_edge1xy[2];
+       float mip_edge0xymul;
+       float mip_edge1xymul;
+       float mip_edge0mip;
+       float mip_edge1mip;
+       float mipdensity;
+       unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS];
        float startxf;
        float endxf;
        float edge0ylerp;
@@ -1923,6 +2136,7 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
        float clipped[DPSOFTRAST_ARRAY_TOTAL][4][4];
        float screen[4][4];
        float proj[DPSOFTRAST_ARRAY_TOTAL][4][4];
+       DPSOFTRAST_Texture *texture;
        DPSOFTRAST_State_Draw_Span *span;
        DPSOFTRAST_State_Draw_Span *oldspan;
        for (i = 0;i < numtriangles;i++)
@@ -1993,21 +2207,27 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
 #define CLIPPEDVERTEXLERP(k,p1,p2) \
                        frac = clipdist[p1] / (clipdist[p1] - clipdist[p2]);\
                        ifrac = 1.0f - frac;\
-                       for (j = 0;j < numarrays;j++)\
+                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
                        {\
-                               clipped[j][k][0] = dpsoftrast.draw.post_array4f[j][e[p1]*4+0]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+0]*frac;\
-                               clipped[j][k][1] = dpsoftrast.draw.post_array4f[j][e[p1]*4+1]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+1]*frac;\
-                               clipped[j][k][2] = dpsoftrast.draw.post_array4f[j][e[p1]*4+2]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+2]*frac;\
-                               clipped[j][k][3] = dpsoftrast.draw.post_array4f[j][e[p1]*4+3]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+3]*frac;\
+                               if (arraymask[j])\
+                               {\
+                                       clipped[j][k][0] = dpsoftrast.draw.post_array4f[j][e[p1]*4+0]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+0]*frac;\
+                                       clipped[j][k][1] = dpsoftrast.draw.post_array4f[j][e[p1]*4+1]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+1]*frac;\
+                                       clipped[j][k][2] = dpsoftrast.draw.post_array4f[j][e[p1]*4+2]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+2]*frac;\
+                                       clipped[j][k][3] = dpsoftrast.draw.post_array4f[j][e[p1]*4+3]*ifrac+dpsoftrast.draw.post_array4f[j][e[p2]*4+3]*frac;\
+                               }\
                        }\
                        DPSOFTRAST_Draw_ProjectVertices(screen[k], clipped[DPSOFTRAST_ARRAY_POSITION][k], 1)
 #define CLIPPEDVERTEXCOPY(k,p1) \
-                       for (j = 0;j < numarrays;j++)\
+                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)\
                        {\
-                               clipped[j][k][0] = dpsoftrast.draw.post_array4f[j][e[p1]*4+0];\
-                               clipped[j][k][1] = dpsoftrast.draw.post_array4f[j][e[p1]*4+1];\
-                               clipped[j][k][2] = dpsoftrast.draw.post_array4f[j][e[p1]*4+2];\
-                               clipped[j][k][3] = dpsoftrast.draw.post_array4f[j][e[p1]*4+3];\
+                               if (arraymask[j])\
+                               {\
+                                       clipped[j][k][0] = dpsoftrast.draw.post_array4f[j][e[p1]*4+0];\
+                                       clipped[j][k][1] = dpsoftrast.draw.post_array4f[j][e[p1]*4+1];\
+                                       clipped[j][k][2] = dpsoftrast.draw.post_array4f[j][e[p1]*4+2];\
+                                       clipped[j][k][3] = dpsoftrast.draw.post_array4f[j][e[p1]*4+3];\
+                               }\
                        }\
                        screen[k][0] = dpsoftrast.draw.screencoord4f[e[p1]*4+0];\
                        screen[k][1] = dpsoftrast.draw.screencoord4f[e[p1]*4+1];\
@@ -2104,15 +2324,52 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                // (which is basically 1/Z), which will be undone per-pixel
                // (multiplying by Z again) to get the perspective-correct array
                // values
-               for (j = 0;j < numarrays;j++)
+               for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                {
-                       for (k = 0;k < numpoints;k++)
+                       if (arraymask[j])
                        {
-                               w = screen[k][3];
-                               proj[j][k][0] = clipped[j][k][0] * w;
-                               proj[j][k][1] = clipped[j][k][1] * w;
-                               proj[j][k][2] = clipped[j][k][2] * w;
-                               proj[j][k][3] = clipped[j][k][3] * w;
+                               for (k = 0;k < numpoints;k++)
+                               {
+                                       w = screen[k][3];
+                                       proj[j][k][0] = clipped[j][k][0] * w;
+                                       proj[j][k][1] = clipped[j][k][1] * w;
+                                       proj[j][k][2] = clipped[j][k][2] * w;
+                                       proj[j][k][3] = clipped[j][k][3] * w;
+                               }
+                       }
+               }
+               // adjust texture LOD by texture density, in the simplest way possible...
+               mip_edge0xy[0] = screen[0][0] - screen[1][0];
+               mip_edge0xy[1] = screen[0][1] - screen[1][1];
+               mip_edge1xy[0] = screen[2][0] - screen[1][0];
+               mip_edge1xy[1] = screen[2][1] - screen[1][1];
+               mip_edge0xymul = 1.0f / (mip_edge0xy[0]*mip_edge0xy[0]+mip_edge0xy[1]*mip_edge0xy[1]);
+               mip_edge1xymul = 1.0f / (mip_edge1xy[0]*mip_edge1xy[0]+mip_edge1xy[1]*mip_edge1xy[1]);
+               for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
+               {
+                       texture = dpsoftrast.texbound[j];
+                       if (texture)
+                       {
+                               if (texture->filter <= DPSOFTRAST_TEXTURE_FILTER_LINEAR)
+                               {
+                                       mip[j] = 0;
+                                       continue;
+                               }
+                               // FIXME: use appropriate array for this texture!
+                               mip_edge0tc[0] = (clipped[DPSOFTRAST_ARRAY_TEXCOORD0][0][0] - clipped[DPSOFTRAST_ARRAY_TEXCOORD0][1][0]) * texture->mipmap[0][2];
+                               mip_edge0tc[1] = (clipped[DPSOFTRAST_ARRAY_TEXCOORD0][0][1] - clipped[DPSOFTRAST_ARRAY_TEXCOORD0][1][1]) * texture->mipmap[0][3];
+                               mip_edge1tc[0] = (clipped[DPSOFTRAST_ARRAY_TEXCOORD0][2][0] - clipped[DPSOFTRAST_ARRAY_TEXCOORD0][1][0]) * texture->mipmap[0][2];
+                               mip_edge1tc[1] = (clipped[DPSOFTRAST_ARRAY_TEXCOORD0][2][1] - clipped[DPSOFTRAST_ARRAY_TEXCOORD0][1][1]) * texture->mipmap[0][3];
+                               mip_edge0mip = (mip_edge0tc[0]*mip_edge0tc[0]+mip_edge0tc[1]*mip_edge0tc[1]) * mip_edge0xymul;
+                               mip_edge1mip = (mip_edge1tc[0]*mip_edge1tc[0]+mip_edge1tc[1]*mip_edge1tc[1]) * mip_edge1xymul;
+                               // this will be multiplied in the texturing routine by the texture resolution
+                               mipdensity = mip_edge0mip < mip_edge1mip ? mip_edge0mip : mip_edge1mip;
+                               y = (int)(log(mipdensity)/log(2) + 0.5f);
+                               if (y < 0)
+                                       y = 0;
+                               if (y > texture->mipmaps - 1)
+                                       y = texture->mipmaps - 1;
+                               mip[j] = y;
                        }
                }
                // iterate potential spans
@@ -2230,6 +2487,7 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                        spanilength = 1.0f / (endxf - startxf);
                        startxlerp = startx - startxf;
                        span = &dpsoftrast.draw.spanqueue[dpsoftrast.draw.numspans++];
+                       memcpy(span->mip, mip, sizeof(span->mip));
                        span->start = y * width + startx;
                        span->length = endx - startx;
                        j = DPSOFTRAST_ARRAY_TOTAL;
@@ -2243,16 +2501,19 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                                span->data[1][j][1] = screen[edge1p][1] * edge1yilerp + screen[edge1n][1] * edge1ylerp;
                                span->data[1][j][2] = screen[edge1p][2] * edge1yilerp + screen[edge1n][2] * edge1ylerp;
                                span->data[1][j][3] = screen[edge1p][3] * edge1yilerp + screen[edge1n][3] * edge1ylerp;
-                               for (j = 0;j < numarrays;j++)
+                               for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                                {
-                                       span->data[0][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
-                                       span->data[0][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
-                                       span->data[0][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
-                                       span->data[0][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
-                                       span->data[1][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
-                                       span->data[1][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
-                                       span->data[1][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
-                                       span->data[1][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
+                                       if (arraymask[j])
+                                       {
+                                               span->data[0][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
+                                               span->data[0][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
+                                               span->data[0][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
+                                               span->data[0][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
+                                               span->data[1][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
+                                               span->data[1][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
+                                               span->data[1][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
+                                               span->data[1][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
+                                       }
                                }
                        }
                        else
@@ -2265,16 +2526,19 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                                span->data[1][j][1] = screen[edge0p][1] * edge0yilerp + screen[edge0n][1] * edge0ylerp;
                                span->data[1][j][2] = screen[edge0p][2] * edge0yilerp + screen[edge0n][2] * edge0ylerp;
                                span->data[1][j][3] = screen[edge0p][3] * edge0yilerp + screen[edge0n][3] * edge0ylerp;
-                               for (j = 0;j < numarrays;j++)
+                               for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                                {
-                                       span->data[0][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
-                                       span->data[0][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
-                                       span->data[0][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
-                                       span->data[0][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
-                                       span->data[1][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
-                                       span->data[1][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
-                                       span->data[1][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
-                                       span->data[1][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
+                                       if (arraymask[j])
+                                       {
+                                               span->data[0][j][0] = proj[j][edge1p][0] * edge1yilerp + proj[j][edge1n][0] * edge1ylerp;
+                                               span->data[0][j][1] = proj[j][edge1p][1] * edge1yilerp + proj[j][edge1n][1] * edge1ylerp;
+                                               span->data[0][j][2] = proj[j][edge1p][2] * edge1yilerp + proj[j][edge1n][2] * edge1ylerp;
+                                               span->data[0][j][3] = proj[j][edge1p][3] * edge1yilerp + proj[j][edge1n][3] * edge1ylerp;
+                                               span->data[1][j][0] = proj[j][edge0p][0] * edge0yilerp + proj[j][edge0n][0] * edge0ylerp;
+                                               span->data[1][j][1] = proj[j][edge0p][1] * edge0yilerp + proj[j][edge0n][1] * edge0ylerp;
+                                               span->data[1][j][2] = proj[j][edge0p][2] * edge0yilerp + proj[j][edge0n][2] * edge0ylerp;
+                                               span->data[1][j][3] = proj[j][edge0p][3] * edge0yilerp + proj[j][edge0n][3] * edge0ylerp;
+                                       }
                                }
                        }
                        // change data[1][n][] to be a data slope
@@ -2283,12 +2547,15 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                        span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
                        span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
                        span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
-                       for (j = 0;j < numarrays;j++)
+                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                        {
-                               span->data[1][j][0] = (span->data[1][j][0] - span->data[0][j][0]) * spanilength;
-                               span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
-                               span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
-                               span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
+                               if (arraymask[j])
+                               {
+                                       span->data[1][j][0] = (span->data[1][j][0] - span->data[0][j][0]) * spanilength;
+                                       span->data[1][j][1] = (span->data[1][j][1] - span->data[0][j][1]) * spanilength;
+                                       span->data[1][j][2] = (span->data[1][j][2] - span->data[0][j][2]) * spanilength;
+                                       span->data[1][j][3] = (span->data[1][j][3] - span->data[0][j][3]) * spanilength;
+                               }
                        }
                        // adjust the data[0][n][] to be correct for the pixel centers
                        // this also handles horizontal clipping where a major part of the
@@ -2298,12 +2565,15 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                        span->data[0][j][1] += span->data[1][j][1] * startxlerp;
                        span->data[0][j][2] += span->data[1][j][2] * startxlerp;
                        span->data[0][j][3] += span->data[1][j][3] * startxlerp;
-                       for (j = 0;j < numarrays;j++)
+                       for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                        {
-                               span->data[0][j][0] += span->data[1][j][0] * startxlerp;
-                               span->data[0][j][1] += span->data[1][j][1] * startxlerp;
-                               span->data[0][j][2] += span->data[1][j][2] * startxlerp;
-                               span->data[0][j][3] += span->data[1][j][3] * startxlerp;
+                               if (arraymask[j])
+                               {
+                                       span->data[0][j][0] += span->data[1][j][0] * startxlerp;
+                                       span->data[0][j][1] += span->data[1][j][1] * startxlerp;
+                                       span->data[0][j][2] += span->data[1][j][2] * startxlerp;
+                                       span->data[0][j][3] += span->data[1][j][3] * startxlerp;
+                               }
                        }
                        // to keep the shader routines from needing more than a small
                        // buffer for pixel intermediate data, we split long spans...
@@ -2326,12 +2596,15 @@ void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numvertices, int numt
                                span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
                                span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
                                span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                               for (j = 0;j < numarrays;j++)
+                               for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL;j++)
                                {
-                                       span->data[0][j][0] += span->data[1][j][0] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                       span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                       span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                       span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                                       if (arraymask[j])
+                                       {
+                                               span->data[0][j][0] += span->data[1][j][0] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                                               span->data[0][j][1] += span->data[1][j][1] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                                               span->data[0][j][2] += span->data[1][j][2] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                                               span->data[0][j][3] += span->data[1][j][3] * DPSOFTRAST_DRAW_MAXSPANLENGTH;
+                                       }
                                }
                        }
                        // after all that, we have a span suitable for the pixel shader...
@@ -2386,11 +2659,22 @@ void DPSOFTRAST_Draw_DebugPoints(void)
 
 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
 {
+       unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL]; // per-array enable flags, handed to DPSOFTRAST_Draw_ProcessTriangles at the end of this function
+       arraymask[0] = true; // array 0 is unconditionally enabled
+       arraymask[1] = dpsoftrast.fb_colorpixels[0] != NULL; // TODO: optimize (decide based on shadermode)
+       arraymask[2] = dpsoftrast.pointer_texcoordf[0] != NULL; // entries 2..9 mirror pointer_texcoordf[0..7]: enabled only when that pointer is non-NULL
+       arraymask[3] = dpsoftrast.pointer_texcoordf[1] != NULL;
+       arraymask[4] = dpsoftrast.pointer_texcoordf[2] != NULL;
+       arraymask[5] = dpsoftrast.pointer_texcoordf[3] != NULL;
+       arraymask[6] = dpsoftrast.pointer_texcoordf[4] != NULL;
+       arraymask[7] = dpsoftrast.pointer_texcoordf[5] != NULL;
+       arraymask[8] = dpsoftrast.pointer_texcoordf[6] != NULL;
+       arraymask[9] = dpsoftrast.pointer_texcoordf[7] != NULL; // NOTE(review): only indices 0-9 are assigned; assumes DPSOFTRAST_ARRAY_TOTAL == 10, otherwise entries are left unset or written out of bounds - confirm
        DPSOFTRAST_Validate(DPSOFTRAST_VALIDATE_DRAW);
-       DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, true, 1);
+       DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, true);
        DPSOFTRAST_Draw_VertexShader();
        DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.draw.screencoord4f, dpsoftrast.draw.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
-       DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numvertices, numtriangles, element3i, element3s, 3);
+       DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numvertices, numtriangles, element3i, element3s, arraymask); // span setup there tests arraymask[j] and skips arrays whose entry is zero
 }
 
 void DPSOFTRAST_Init(int width, int height, unsigned int *colorpixels, unsigned int *depthpixels)