git.xonotic.org Git - xonotic/darkplaces.git/blobdiff - dpsoftrast.c
UNMERGE
[xonotic/darkplaces.git] / dpsoftrast.c
index 072fd564466bc209f876be0aa463e75132c07cce..e8eb95706a1fbf7fd082d5fc9473d2f628399847 100644 (file)
@@ -3,12 +3,9 @@
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include "quakedef.h"
+#include "thread.h"
 #include "dpsoftrast.h"
 
-#ifdef USE_SDL
-#define USE_THREADS
-#endif
-
 #ifndef __cplusplus
 typedef qboolean bool;
 #endif
@@ -17,51 +14,56 @@ typedef qboolean bool;
 #define ATOMIC_SIZE 32
 
 #ifdef SSE2_PRESENT
-       #if defined(__GNUC__)
+       #if defined(__APPLE__)
+               #include <libkern/OSAtomic.h>
+               #define ALIGN(var) var __attribute__((__aligned__(16)))
+               #define ATOMIC(var) var __attribute__((__aligned__(32)))
+               #define MEMORY_BARRIER (_mm_sfence())
+               #define ATOMIC_COUNTER volatile int32_t 
+               #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
+               #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
+               #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
+       #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
-               #ifdef USE_THREADS
-                       #define MEMORY_BARRIER (_mm_sfence())
-                       //(__sync_synchronize())
-                       #define ATOMIC_COUNTER volatile int
-                       #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
-                       #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
-                       #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
-               #endif
+               #define MEMORY_BARRIER (_mm_sfence())
+               //(__sync_synchronize())
+               #define ATOMIC_COUNTER volatile int
+               #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
+               #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
+               #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
-               #ifdef USE_THREADS
-                       #define MEMORY_BARRIER (_mm_sfence())
-                       //(MemoryBarrier())
-                       #define ATOMIC_COUNTER volatile LONG
-                       #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
-                       #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
-                       #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
-               #endif
-       #else
-               #undef USE_THREADS
-               #undef SSE2_PRESENT
+               #define MEMORY_BARRIER (_mm_sfence())
+               //(MemoryBarrier())
+               #define ATOMIC_COUNTER volatile LONG
+               #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
+               #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
+               #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
 #endif
 
-#ifndef SSE2_PRESENT
-       #define ALIGN(var) var
-       #define ATOMIC(var) var
+#ifndef ALIGN
+#define ALIGN(var) var
 #endif
-
-#ifdef USE_THREADS
-#include <SDL.h>
-#include <SDL_thread.h>
-#else
-       #define MEMORY_BARRIER ((void)0)
-       #define ATOMIC_COUNTER int
-       #define ATOMIC_INCREMENT(counter) (++(counter))
-       #define ATOMIC_DECREMENT(counter) (--(counter))
-       #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
-       typedef void SDL_Thread;
-       typedef void SDL_cond;
-       typedef void SDL_mutex;
+#ifndef ATOMIC
+#define ATOMIC(var) var
+#endif
+#ifndef MEMORY_BARRIER
+#define MEMORY_BARRIER ((void)0)
+#endif
+#ifndef ATOMIC_COUNTER
+#define ATOMIC_COUNTER int
+#endif
+#ifndef ATOMIC_INCREMENT
+#define ATOMIC_INCREMENT(counter) (++(counter))
+#endif
+#ifndef ATOMIC_DECREMENT
+#define ATOMIC_DECREMENT(counter) (--(counter))
+#endif
+#ifndef ATOMIC_ADD
+#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
 #endif
 
 #ifdef SSE2_PRESENT
@@ -179,7 +181,6 @@ typedef ALIGN(struct DPSOFTRAST_State_Span_s
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
-       int length; // pixel count
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
@@ -205,13 +206,14 @@ typedef enum DPSOFTRAST_BLENDMODE_e
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
+       DPSOFTRAST_BLENDMODE_INVADD,
        DPSOFTRAST_BLENDMODE_TOTAL
 }
 DPSOFTRAST_BLENDMODE;
 
 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
 {
-       SDL_Thread *thread;
+       void *thread;
        int index;
        
        int cullface;
@@ -243,7 +245,7 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
 
        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
-       int fb_clearscissor[4];
+       int fb_scissor[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);
 
@@ -263,9 +265,9 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
 
        volatile bool waiting;
        volatile bool starving;
-       SDL_cond *waitcond;
-       SDL_cond *drawcond;
-       SDL_mutex *drawmutex;
+       void *waitcond;
+       void *drawcond;
+       void *drawmutex;
 
        int numspans;
        int numtriangles;
@@ -320,6 +322,7 @@ typedef ATOMIC(struct DPSOFTRAST_State_s
        // error reporting
        const char *errorstring;
 
+       bool usethreads;
        int interlace;
        int numthreads;
        DPSOFTRAST_State_Thread *threads;
@@ -365,10 +368,10 @@ static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
        if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
        if (y1 < 0) y1 = 0;
        if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
-       thread->fb_clearscissor[0] = x1;
-       thread->fb_clearscissor[1] = y1;
-       thread->fb_clearscissor[2] = x2 - x1;
-       thread->fb_clearscissor[3] = y2 - y1;
+       thread->fb_scissor[0] = x1;
+       thread->fb_scissor[1] = y1;
+       thread->fb_scissor[2] = x2 - x1;
+       thread->fb_scissor[3] = y2 - y1;
 
        DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
 }
@@ -386,7 +389,7 @@ static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
                {
                #define BLENDFUNC(sfactor, dfactor, blendmode) \
                        case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
-               BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
+               BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
                default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                }
        }
@@ -403,7 +406,7 @@ static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
                BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
                BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
                BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
-               BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
+               BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
                default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                }
        }
@@ -765,13 +768,12 @@ static void DPSOFTRAST_Draw_FlushThreads(void);
 
 static void DPSOFTRAST_Draw_SyncCommands(void)
 {
-       MEMORY_BARRIER;
+       if(dpsoftrast.usethreads) MEMORY_BARRIER; // the barrier is issued only when usethreads is set, before drawcommand is advanced to freecommand below
         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
 }
 
 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
 {
-#ifdef USE_THREADS
        DPSOFTRAST_State_Thread *thread;
        int i;
        int freecommand = dpsoftrast.commandpool.freecommand;
@@ -799,20 +801,17 @@ static void DPSOFTRAST_Draw_FreeCommandPool(int space)
                if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                        break;
                thread = &dpsoftrast.threads[waitindex];
-               SDL_LockMutex(thread->drawmutex);
+               Thread_LockMutex(thread->drawmutex);
                if (thread->commandoffset != dpsoftrast.drawcommand)
                {
                        thread->waiting = true;
-                       if (thread->starving) SDL_CondSignal(thread->drawcond);
-                       SDL_CondWait(thread->waitcond, thread->drawmutex);
+                       if (thread->starving) Thread_CondSignal(thread->drawcond);
+                       Thread_CondWait(thread->waitcond, thread->drawmutex);
                        thread->waiting = false;
                }
-               SDL_UnlockMutex(thread->drawmutex);
+               Thread_UnlockMutex(thread->drawmutex);
        }
        dpsoftrast.commandpool.usedcommands = usedcommands;
-#else
-       DPSOFTRAST_Draw_FlushThreads();
-#endif
 }
 
 #define DPSOFTRAST_ALIGNCOMMAND(size) \
@@ -830,7 +829,10 @@ static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
                extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
        if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
        {
-               DPSOFTRAST_Draw_FreeCommandPool(size + extra);
+               if (dpsoftrast.usethreads)
+                       DPSOFTRAST_Draw_FreeCommandPool(size + extra);
+               else
+                       DPSOFTRAST_Draw_FlushThreads();
                freecommand = dpsoftrast.commandpool.freecommand;
                usedcommands = dpsoftrast.commandpool.usedcommands;
        }
@@ -900,10 +902,10 @@ static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, con
        unsigned int *p;
        unsigned int c;
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
-       x1 = thread->fb_clearscissor[0];
-       y1 = thread->fb_clearscissor[1];
-       x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
-       y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
+       x1 = thread->fb_scissor[0];
+       y1 = thread->fb_scissor[1];
+       x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
+       y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
@@ -946,10 +948,10 @@ static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPS
        unsigned int *p;
        unsigned int c;
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
-       x1 = thread->fb_clearscissor[0];
-       y1 = thread->fb_clearscissor[1];
-       x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
-       y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
+       x1 = thread->fb_scissor[0];
+       y1 = thread->fb_scissor[1];
+       x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
+       y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
@@ -1157,7 +1159,6 @@ void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockh
        int bx2 = blockx + blockwidth;
        int by2 = blocky + blockheight;
        int bw;
-       int bh;
        int x;
        int y;
        unsigned char *inpixels;
@@ -1169,7 +1170,6 @@ void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockh
        if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
        if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
        bw = bx2 - bx1;
-       bh = by2 - by1;
        inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
        if (dpsoftrast.bigendian)
        {
@@ -1223,8 +1223,7 @@ void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int s
        DPSOFTRAST_Texture *texture;
        texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
        if (mip < 0 || mip >= texture->mipmaps) return;
-       if (texture->binds)
-               DPSOFTRAST_Flush();
+       DPSOFTRAST_Flush();
        spixels = dpsoftrast.fb_colorpixels[0];
        swidth = dpsoftrast.fb_width;
        sheight = dpsoftrast.fb_height;
@@ -1247,8 +1246,9 @@ void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int s
        if (th > sh) th = sh;
        if (tw < 1 || th < 1)
                return;
+       sy1 = sheight - 1 - sy1;
        for (y = 0;y < th;y++)
-               memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
+               memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
        if (texture->mipmaps > 1)
                DPSOFTRAST_Texture_CalculateMipmaps(index);
 }
@@ -1365,7 +1365,7 @@ void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int
        {
                __m128 m0, m1, m2, m3;
                DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
-               command->index = index;
+               command->index = (DPSOFTRAST_UNIFORM)index;
                if (((size_t)v)&(ALIGN_SIZE-1))
                {
                        m0 = _mm_loadu_ps(v);
@@ -1761,11 +1761,9 @@ static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m12
        *endy = _mm_cvttss_si32(minproj)+1;
        return clipmask;
 }
-#endif
        
 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
 {
-#ifdef SSE2_PRESENT
        float *end = out4f + numitems*4;
        __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        __m128 minpos, maxpos;
@@ -1808,12 +1806,10 @@ static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty,
                                        _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
                                        _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
        return 0;
-#endif
 }
 
 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
 {
-#ifdef SSE2_PRESENT
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
        float *end;
@@ -1863,11 +1859,12 @@ static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int
        if (starty && endy) 
                return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
        return 0;
-#endif
 }
+#endif
 
 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
 {
+#ifdef SSE2_PRESENT
        float *outf = dpsoftrast.post_array4f[outarray];
        const unsigned char *inb;
        int firstvertex = dpsoftrast.firstvertex;
@@ -1919,6 +1916,9 @@ static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
                break;
        }
        return outf;
+#else
+       return NULL;
+#endif
 }
 
 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
@@ -1931,17 +1931,25 @@ static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float
 #if 0
 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
 {
+#ifdef SSE2_PRESENT // the load + projection path below is compiled only when SSE2_PRESENT is defined
         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
         return data;
+#else // SSE2_PRESENT not defined
+       return NULL; // nothing is loaded or projected in this configuration; drawclipped is left untouched
+#endif // SSE2_PRESENT
 }
 #endif
 
 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
 {
+#ifdef SSE2_PRESENT // the load + transform/projection path below is compiled only when SSE2_PRESENT is defined
         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
         return data;
+#else // SSE2_PRESENT not defined
+       return NULL; // nothing is loaded or transformed in this configuration; drawclipped is left untouched
+#endif // SSE2_PRESENT
 }
 
 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
@@ -2125,6 +2133,21 @@ void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRA
                        pixel[x*4+3] = d[3];
                }
                break;
+       case DPSOFTRAST_BLENDMODE_INVADD:
+               for (x = startx;x < endx;x++)
+               {
+                       if (!pixelmask[x])
+                               continue;
+                       d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
+                       d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
+                       d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
+                       d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
+                       pixel[x*4+0] = d[0];
+                       pixel[x*4+1] = d[1];
+                       pixel[x*4+2] = d[2];
+                       pixel[x*4+3] = d[3];
+               }
+               break;
        }
 }
 
@@ -2171,7 +2194,7 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
                break;
        case DPSOFTRAST_BLENDMODE_ALPHA:
        #define FINISHBLEND(blend2, blend1) \
-               for (x = startx;x + 2 <= endx;x += 2) \
+               for (x = startx;x + 1 < endx;x += 2) \
                { \
                        __m128i src, dst; \
                        switch (*(const unsigned short*)&pixelmask[x]) \
@@ -2259,6 +2282,13 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
                        dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
                });
                break;
+       case DPSOFTRAST_BLENDMODE_INVADD:
+               FINISHBLEND({
+                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+               }, {
+                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+               });
+               break;
        }
 #endif
 }
@@ -2469,7 +2499,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
        // if no texture is bound, just fill it with white
        if (!texture)
        {
-               memset(out4ub + startx*4, 255, span->length*4);
+               memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
                return;
        }
        mip = triangle->mip[texunitindex];
@@ -2728,7 +2758,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
 {
        // TODO: IMPLEMENT
-       memset(out4ub, 255, span->length*4);
+       memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
 }
 
 float DPSOFTRAST_SampleShadowmap(const float *vector)
@@ -2963,19 +2993,19 @@ void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRI
 #ifdef SSE2_PRESENT
        int x, startx = span->startx, endx = span->endx;
        __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
-       localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
+       localcolor = _mm_packs_epi32(localcolor, localcolor);
        for (x = startx;x+2 <= endx;x+=2)
        {
                __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
                __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
-               pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
+               pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
                _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
        }
        if (x < endx)
        {
                __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
-               pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
+               pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
                *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
        }
 #endif
@@ -3028,7 +3058,7 @@ void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle
 #ifdef SSE2_PRESENT
        int x, startx = span->startx, endx = span->endx;
        __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
-       tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
+       tint = _mm_packs_epi32(tint, tint);
        for (x = startx;x+2 <= endx;x+=2)
        {
                __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
@@ -3074,7 +3104,7 @@ void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle *
 #ifdef SSE2_PRESENT
        int x, startx = span->startx, endx = span->endx;
        __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
-       localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
+       localcolor = _mm_packs_epi32(localcolor, localcolor);
        blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
        for (x = startx;x+2 <= endx;x+=2)
        {
@@ -3144,7 +3174,7 @@ void DPSOFTRAST_VertexShader_PostProcess(void)
 {
         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
-       DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
+       DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4); // arguments are (outarray, inarray): output slot TEXCOORD1 is filled from input array TEXCOORD4
 }
 
 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
@@ -3185,7 +3215,7 @@ void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, con
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
@@ -3199,25 +3229,44 @@ void DPSOFTRAST_VertexShader_FlatColor(void)
 
 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
 {
+#ifdef SSE2_PRESENT // the whole body is SSE2-only; without SSE2_PRESENT this function does nothing
+       unsigned char * RESTRICT pixelmask = span->pixelmask; // per-pixel mask bytes; 0 means the pixel is skipped below
+       unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4; // default destination: the framebuffer at (span->x, span->y), 4 bytes per pixel
         int x, startx = span->startx, endx = span->endx;
-       int Color_Ambienti[4];
+       __m128i Color_Ambientm; // ambient color and alpha multiplier as packed 16-bit lanes (built below)
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
-       Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
-       Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
-       Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
-       Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
+       if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE) // alpha test or any non-opaque blend mode cannot be written straight to the framebuffer
+               pixel = buffer_FragColorbgra8; // render into the temporary buffer and hand it to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end
+       Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2)); // ambient uniform * 256 as integers; the shuffle swaps lanes 0 and 2
+       Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); // clear lane 3 ...
+       Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f))); // ... and store the Alpha uniform there, scaled by 255 (the color lanes use 256)
+       Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm); // pack to 16-bit lanes; the same four values end up in both 64-bit halves
         for (x = startx;x < endx;x++)
         {
-               buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
-               buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
-               buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
-               buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
+               __m128i color, pix;
+               if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101) // fast path: at least four pixels remain and all four mask bytes equal 1
+               {
+                       __m128i pix2;
+                       color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]); // four texels (16 bytes) at once
+                       pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color)); // zero-first unpack puts each byte in the high half of a 16-bit lane; mulhi keeps the top 16 bits of the product
+                       pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color)); // same for the upper two pixels
+                       _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2)); // saturate back to bytes and store all four pixels
+                       x += 3; // together with the loop's x++ this advances by four pixels
+                       continue;
+               }
+               if (!pixelmask[x]) // single-pixel path: skip pixels whose mask byte is 0
+                       continue;
+               color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4])); // one texel, bytes placed in the high half of each 16-bit lane
+               pix = _mm_mulhi_epu16(Color_Ambientm, color);
+               *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix)); // store a single 4-byte pixel
         }
-       DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
+       if (pixel == buffer_FragColorbgra8) // only spans rendered into the temporary buffer need the finish pass
+               DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
+#endif // SSE2_PRESENT
 }
 
 
@@ -3407,21 +3456,23 @@ void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPS
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
 
 
+void DPSOFTRAST_VertexShader_LightDirection(void); // forward declaration; the definition appears further down in this file
 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
 {
-       DPSOFTRAST_VertexShader_Lightmap();
+       DPSOFTRAST_VertexShader_LightDirection(); // delegate to the LightDirection vertex shader instead of the Lightmap one
+       DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4); // additionally load input array TEXCOORD4 into the output slot of the same index
 }
 
+void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // forward declaration; the definition appears further down in this file
 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
 {
-       DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
-       // TODO: IMPLEMENT
+       DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span); // delegate to the LightDirection pixel shader with the same arguments
 }
 
 
@@ -3482,20 +3533,20 @@ void DPSOFTRAST_VertexShader_LightDirection(void)
                LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
                LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
                LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
                EyeVectorModelSpace[0] = EyePosition[0] - position[0];
                EyeVectorModelSpace[1] = EyePosition[1] - position[1];
                EyeVectorModelSpace[2] = EyePosition[2] - position[2];
                EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
                EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
                EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
-               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
+               dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
        }
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
 }
@@ -3528,6 +3579,8 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
        unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
+       unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
+       unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        int x, startx = span->startx, endx = span->endx;
        float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
@@ -3535,11 +3588,18 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
        float LightVectorslope[4];
        float EyeVectordata[4];
        float EyeVectorslope[4];
+       float VectorSdata[4];
+       float VectorSslope[4];
+       float VectorTdata[4];
+       float VectorTslope[4];
+       float VectorRdata[4];
+       float VectorRslope[4];
        float z;
        float diffusetex[4];
        float glosstex[4];
        float surfacenormal[4];
        float lightnormal[4];
+       float lightnormal_modelspace[4];
        float eyenormal[4];
        float specularnormal[4];
        float diffuse;
@@ -3583,15 +3643,25 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
                LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
                LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
                LightColor[3] = 0.0f;
-               DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
+               DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
                DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
                Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
                Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
                Color_Specular[3] = 0.0f;
                SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
-               DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
+               DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
                DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
+
+               if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
+               {
+                       DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
+                       DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
+                       DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
+                       DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
+                       DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
+               }
+
                for (x = startx;x < endx;x++)
                {
                        z = buffer_z[x];
@@ -3615,10 +3685,46 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
                        surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
                        DPSOFTRAST_Vector3Normalize(surfacenormal);
 
-                       lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
-                       lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
-                       lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
-                       DPSOFTRAST_Vector3Normalize(lightnormal);
+                       if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
+                       {
+                               // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);
+                               lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
+                               lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
+                               lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
+
+                               // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));
+                               lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
+                                              + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
+                                              + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
+
+                               // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));
+                               lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
+                                              + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
+                                              + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
+
+                               // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));
+                               lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
+                                              + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
+                                              + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
+
+                               // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this
+                               DPSOFTRAST_Vector3Normalize(lightnormal);
+
+                               // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));
+                               {
+                                       float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
+                                       LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
+                                       LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
+                                       LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
+                               }
+                       }
+                       else
+                       {
+                               lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
+                               lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
+                               lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
+                               DPSOFTRAST_Vector3Normalize(lightnormal);
+                       }
 
                        eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
                        eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
@@ -3647,6 +3753,7 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
                                d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
                                d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
                        }
+
                        buffer_FragColorbgra8[x*4+0] = d[0];
                        buffer_FragColorbgra8[x*4+1] = d[1];
                        buffer_FragColorbgra8[x*4+2] = d[2];
@@ -3663,8 +3770,18 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
                LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
                LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
                LightColor[3] = 0.0f;
-               DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
+               DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
                DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
+
+               if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
+               {
+                       DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
+                       DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
+                       DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
+                       DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
+                       DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
+               }
+
                for (x = startx;x < endx;x++)
                {
                        z = buffer_z[x];
@@ -3677,10 +3794,46 @@ void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, cons
                        surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
                        DPSOFTRAST_Vector3Normalize(surfacenormal);
 
-                       lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
-                       lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
-                       lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
-                       DPSOFTRAST_Vector3Normalize(lightnormal);
+                       if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
+                       {
+                               // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);
+                               lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
+                               lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
+                               lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
+
+                               // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));
+                               lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
+                                              + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
+                                              + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
+
+                               // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));
+                               lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
+                                              + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
+                                              + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
+
+                               // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));
+                               lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
+                                              + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
+                                              + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
+
+                               // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this
+                               DPSOFTRAST_Vector3Normalize(lightnormal);
+
+                               // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));
+                               {
+                                       float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
+                                       LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
+                                       LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
+                                       LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
+                               }
+                       }
+                       else
+                       {
+                               lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
+                               lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
+                               lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
+                               DPSOFTRAST_Vector3Normalize(lightnormal);
+                       }
 
                        diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
                        if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
@@ -4090,7 +4243,7 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
@@ -4108,7 +4261,7 @@ void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTR
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
@@ -4125,7 +4278,7 @@ void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPS
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
@@ -4142,7 +4295,7 @@ void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, co
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
@@ -4159,7 +4312,7 @@ void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread,
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
 }
 
@@ -4184,15 +4337,15 @@ static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COU
        {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
        {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
        {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
-       {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
+       {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
        {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
-       {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
+       {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
        {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
        {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
        {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
        {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
        {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
-       {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
+       {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
 };
 
 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
@@ -4223,23 +4376,23 @@ void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
                        depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
                        depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
                        depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
+                       startx = span->startx;
+                       endx = span->endx;
                        switch(thread->fb_depthfunc)
                        {
                        default:
-                       case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
-                       case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
-                       case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
-                       case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
-                       case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
-                       case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
-                       case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
+                       case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
+                       case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
+                       case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
+                       case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
+                       case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
+                       case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
+                       case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
                        }
                        //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
-                       //for (x = 0;x < span->length;x++)
+                       //for (x = startx;x < endx;x++)
                        //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
                        // if there is no color buffer, skip pixel shader
-                       startx = 0;
-                       endx = span->length;
                        while (startx < endx && !pixelmask[startx])
                                startx++;
                        while (endx > startx && !pixelmask[endx-1])
@@ -4265,10 +4418,8 @@ void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
                        // if there is no color buffer, skip pixel shader
                        if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
                        {
-                               memset(pixelmask, 1, span->length);
+                               memset(pixelmask + span->startx, 1, span->endx - span->startx);
                                span->pixelmask = pixelmask;
-                               span->startx = 0;
-                               span->endx = span->length;
                                DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
                        }
                }
@@ -4282,11 +4433,8 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
 {
 #ifdef SSE2_PRESENT
        int cullface = thread->cullface;
-       int width = dpsoftrast.fb_width;
-       int miny1 = thread->miny1;
-       int maxy1 = thread->maxy1;
-       int miny2 = thread->miny2;
-       int maxy2 = thread->maxy2;
+       int minx, maxx, miny, maxy;
+       int miny1, maxy1, miny2, maxy2;
        __m128i fbmin, fbmax;
        __m128 viewportcenter, viewportscale;
        int firstvertex = command->firstvertex;
@@ -4310,6 +4458,13 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
        __m128 screen[4];
        DPSOFTRAST_State_Triangle *triangle;
        DPSOFTRAST_Texture *texture;
+       DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
+       miny = thread->fb_scissor[1];
+       maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
+       miny1 = bound(miny, thread->miny1, maxy);
+       maxy1 = bound(miny, thread->maxy1, maxy);
+       miny2 = bound(miny, thread->miny2, maxy);
+       maxy2 = bound(miny, thread->maxy2, maxy);
        if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
        {
                if (!ATOMIC_DECREMENT(command->refcount))
@@ -4319,9 +4474,10 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                }
                return;
        }
-       DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
-       fbmin = _mm_setr_epi16(0, miny1, 0, miny1, 0, miny1, 0, miny1);
-       fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy2, width, maxy2, width, maxy2, width, maxy2), _mm_set1_epi16(1));
+       minx = thread->fb_scissor[0];
+       maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
+       fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
+       fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
        viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
        viewportscale = _mm_load_ps(thread->fb_viewportscale);
        screen[3] = _mm_setzero_ps();
@@ -4536,9 +4692,9 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                                attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
                                attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
                                attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
-                               _mm_stream_ps(triangle->attribs[k][0], attribxslope);
-                               _mm_stream_ps(triangle->attribs[k][1], attribyslope);
-                               _mm_stream_ps(triangle->attribs[k][2], attriborigin);
+                               _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
+                               _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
+                               _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
                                {
                                        mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
@@ -4625,38 +4781,38 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                        ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
                        nexty = _mm_extract_epi16(ycc, 0);
                        if (nexty >= bandy) nexty = bandy-1;
-                       if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
-                       {
-                               int tmp = edge0n;
-                               edge0n = edge1n;
-                               edge1n = tmp;
-                               tmp = edge0p;
-                               edge0p = edge1p;
-                               edge1p = tmp;
-                       }
                        xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
                        xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
                        xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
                                                                _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
                        xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
+                       if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
+                       {
+                               xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
+                               xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
+                       }
                        for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
                        {
                                int startx, endx, offset;
                                startx = _mm_cvtss_si32(xcoords);
                                endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
-                               if (startx < 0) startx = 0;
-                               if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
+                               if (startx < minx) 
+                               {
+                                       if (startx < 0) startx = 0;
+                                       startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
+                               }
+                               if (endx > maxx) endx = maxx;
                                if (startx >= endx) continue;
-                               for (offset = startx; offset < endx;)
+                               for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
                                {
                                        DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
                                        span->triangle = thread->numtriangles;
                                        span->x = offset;
                                        span->y = y;
-                                       span->length = endx - offset;
-                                       if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
-                                               span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                       offset += span->length;
+                                       span->startx = max(minx - offset, 0);
+                                       span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
+                                       if (span->startx >= span->endx)
+                                               continue; 
                                        if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
                                                DPSOFTRAST_Draw_ProcessSpans(thread);
                                }
@@ -4764,20 +4920,21 @@ void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles
        command->clipped = dpsoftrast.drawclipped;
        command->refcount = dpsoftrast.numthreads;
 
-#ifdef USE_THREADS
-       DPSOFTRAST_Draw_SyncCommands();
+       if (dpsoftrast.usethreads)
        {
                int i;
+               DPSOFTRAST_Draw_SyncCommands();
                for (i = 0; i < dpsoftrast.numthreads; i++)
                {
                        DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
                        if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
-                               SDL_CondSignal(thread->drawcond);
+                               Thread_CondSignal(thread->drawcond);
                }
        }
-#else
-       DPSOFTRAST_Draw_FlushThreads();
-#endif
+       else
+       {
+               DPSOFTRAST_Draw_FlushThreads();
+       }
 }
  
 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
@@ -4833,7 +4990,6 @@ static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, i
        thread->commandoffset = commandoffset;
 }
 
-#ifdef USE_THREADS
 static int DPSOFTRAST_Draw_Thread(void *data)
 {
        DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
@@ -4845,58 +5001,62 @@ static int DPSOFTRAST_Draw_Thread(void *data)
                }
                else 
                {
-                       SDL_LockMutex(thread->drawmutex);
+                       Thread_LockMutex(thread->drawmutex);
                        if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
                        {
-                               if (thread->waiting) SDL_CondSignal(thread->waitcond);
+                               if (thread->waiting) Thread_CondSignal(thread->waitcond);
                                thread->starving = true;
-                               SDL_CondWait(thread->drawcond, thread->drawmutex);
+                               Thread_CondWait(thread->drawcond, thread->drawmutex);
                                thread->starving = false;
                        }
-                       SDL_UnlockMutex(thread->drawmutex);
+                       Thread_UnlockMutex(thread->drawmutex);
                }
        }   
        return 0;
 }
-#endif
 
 static void DPSOFTRAST_Draw_FlushThreads(void)
 {
        DPSOFTRAST_State_Thread *thread;
        int i;
        DPSOFTRAST_Draw_SyncCommands();
-#ifdef USE_THREADS
-       for (i = 0; i < dpsoftrast.numthreads; i++)
+       if (dpsoftrast.usethreads) // flush all per-thread states up to dpsoftrast.drawcommand; this runtime flag replaces the removed USE_THREADS #ifdef
        {
-               thread = &dpsoftrast.threads[i];
-               if (thread->commandoffset != dpsoftrast.drawcommand)
+               for (i = 0; i < dpsoftrast.numthreads; i++) // pass 1: wake every starving worker that still has commands pending
                {
-                       SDL_LockMutex(thread->drawmutex);
-                       if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
-                               SDL_CondSignal(thread->drawcond);
-                       SDL_UnlockMutex(thread->drawmutex);
+                       thread = &dpsoftrast.threads[i];
+                       if (thread->commandoffset != dpsoftrast.drawcommand) // unlocked pre-check; the same condition is tested again below while holding drawmutex
+                       {
+                               Thread_LockMutex(thread->drawmutex);
+                               if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
+                                       Thread_CondSignal(thread->drawcond); // DPSOFTRAST_Draw_Thread sets 'starving' right before it waits on drawcond
+                               Thread_UnlockMutex(thread->drawmutex);
+                       }
                }
-       }
-#endif                 
-       for (i = 0; i < dpsoftrast.numthreads; i++)
-       {
-               thread = &dpsoftrast.threads[i];
-#ifdef USE_THREADS
-               if (thread->commandoffset != dpsoftrast.drawcommand)
+               for (i = 0; i < dpsoftrast.numthreads; i++) // pass 2: for each worker that is still behind, wait on its waitcond
                {
-                       SDL_LockMutex(thread->drawmutex);
+                       thread = &dpsoftrast.threads[i];
                        if (thread->commandoffset != dpsoftrast.drawcommand)
                        {
-                               thread->waiting = true;
-                               SDL_CondWait(thread->waitcond, thread->drawmutex);
-                               thread->waiting = false;
+                               Thread_LockMutex(thread->drawmutex);
+                               if (thread->commandoffset != dpsoftrast.drawcommand)
+                               {
+                                       thread->waiting = true; // DPSOFTRAST_Draw_Thread signals waitcond when it sees this flag and has no commands left
+                                       Thread_CondWait(thread->waitcond, thread->drawmutex); // NOTE(review): single wait, commandoffset is not re-checked in a loop after wakeup - confirm this is intended
+                                       thread->waiting = false;
+                               }
+                               Thread_UnlockMutex(thread->drawmutex);
                        }
-                       SDL_UnlockMutex(thread->drawmutex);
                }
-#else
-               if (thread->commandoffset != dpsoftrast.drawcommand)
-                       DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
-#endif
+       }
+       else
+       {
+               for (i = 0; i < dpsoftrast.numthreads; i++)
+               {
+                       thread = &dpsoftrast.threads[i];
+                       if (thread->commandoffset != dpsoftrast.drawcommand)
+                               DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand); // threads disabled: interpret the pending commands directly from this call
+               }
        }
        dpsoftrast.commandpool.usedcommands = 0;
 }
@@ -4911,7 +5071,7 @@ void DPSOFTRAST_Finish(void)
        DPSOFTRAST_Flush();
 }
 
-void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
+int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
 {
        int i;
        union
@@ -4942,12 +5102,9 @@ void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsig
        dpsoftrast.color[1] = 1;
        dpsoftrast.color[2] = 1;
        dpsoftrast.color[3] = 1;
-       dpsoftrast.interlace = bound(0, interlace, 1);
-#ifdef USE_THREADS
-       dpsoftrast.numthreads = bound(1, numthreads, 64);
-#else
-       dpsoftrast.numthreads = 1;
-#endif
+       dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
+       dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
+       dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
        dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
        for (i = 0; i < dpsoftrast.numthreads; i++)
        {
@@ -4997,41 +5154,40 @@ void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsig
                thread->commandoffset = 0;
                thread->waiting = false;
                thread->starving = false;
-#ifdef USE_THREADS
-               thread->waitcond = SDL_CreateCond();
-               thread->drawcond = SDL_CreateCond();
-               thread->drawmutex = SDL_CreateMutex();
-#endif
-
+          
                thread->validate = -1;
                DPSOFTRAST_Validate(thread, -1);
-#ifdef USE_THREADS
-               thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
-#endif
+               if (dpsoftrast.usethreads)
+               {
+                       thread->waitcond = Thread_CreateCond();
+                       thread->drawcond = Thread_CreateCond();
+                       thread->drawmutex = Thread_CreateMutex();
+                       thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
+               }
        }
+       return 0;
 }
 
 void DPSOFTRAST_Shutdown(void)
 {
        int i;
-#ifdef USE_THREADS
-       if (dpsoftrast.numthreads > 0)
+       if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
        {
                DPSOFTRAST_State_Thread *thread;
                for (i = 0; i < dpsoftrast.numthreads; i++)
                {
                        thread = &dpsoftrast.threads[i];
-                       SDL_LockMutex(thread->drawmutex);
+                       Thread_LockMutex(thread->drawmutex);
                        thread->index = -1;
-                       SDL_CondSignal(thread->drawcond);
-                       SDL_UnlockMutex(thread->drawmutex);
-                       SDL_WaitThread(thread->thread, NULL);
-                       SDL_DestroyCond(thread->waitcond);
-                       SDL_DestroyCond(thread->drawcond);
-                       SDL_DestroyMutex(thread->drawmutex);
+                       Thread_CondSignal(thread->drawcond);
+                       Thread_UnlockMutex(thread->drawmutex);
+                       Thread_WaitThread(thread->thread, 0);
+                       Thread_DestroyCond(thread->waitcond);
+                       Thread_DestroyCond(thread->drawcond);
+                       Thread_DestroyMutex(thread->drawmutex);
                }
        }
-#endif
        for (i = 0;i < dpsoftrast.texture_end;i++)
                if (dpsoftrast.texture[i].bytes)
                        MM_FREE(dpsoftrast.texture[i].bytes);