typo fix

[xonotic/darkplaces.git] / dpsoftrast.c
diff --git a/dpsoftrast.c b/dpsoftrast.c

index d6760f0b67b2ea130e0e2a55fd6cdcf01e430fcf..216d628cecd5199159c2da71a06d4fc9899675e4 100644 (file)
--- a/dpsoftrast.c
+++ b/dpsoftrast.c
@@ -3,17 +3,9 @@
  #define _USE_MATH_DEFINES
  #include <math.h>
  #include "quakedef.h"
  #define _USE_MATH_DEFINES
  #include <math.h>
  #include "quakedef.h"
+#include "thread.h"
  #include "dpsoftrast.h"
  
  #include "dpsoftrast.h"
  
-#ifdef USE_SDL
-#define USE_THREADS
-#endif
-
-#ifdef USE_THREADS
-#include <SDL.h>
-#include <SDL_thread.h>
-#endif
-
  #ifndef __cplusplus
  typedef qboolean bool;
  #endif
  #ifndef __cplusplus
  typedef qboolean bool;
  #endif
@@ -22,45 +14,56 @@ typedef qboolean bool;
  #define ATOMIC_SIZE 32
  
  #ifdef SSE2_PRESENT
  #define ATOMIC_SIZE 32
  
  #ifdef SSE2_PRESENT
-       #if defined(__GNUC__)
+       #if defined(__APPLE__)
+               #include <libkern/OSAtomic.h>
                 #define ALIGN(var) var __attribute__((__aligned__(16)))
                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
                 #define ALIGN(var) var __attribute__((__aligned__(16)))
                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
-               #ifdef USE_THREADS
-                       #define MEMORY_BARRIER (_mm_sfence())
-                       //(__sync_synchronize())
-                       #define ATOMIC_COUNTER volatile int
-                       #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
-                       #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
-                       #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
-               #endif
+               #define MEMORY_BARRIER (_mm_sfence())
+               #define ATOMIC_COUNTER volatile int32_t 
+               #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
+               #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
+               #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
+       #elif defined(__GNUC__)
+               #define ALIGN(var) var __attribute__((__aligned__(16)))
+               #define ATOMIC(var) var __attribute__((__aligned__(32)))
+               #define MEMORY_BARRIER (_mm_sfence())
+               //(__sync_synchronize())
+               #define ATOMIC_COUNTER volatile int
+               #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
+               #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
+               #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
         #elif defined(_MSC_VER)
                 #define ALIGN(var) __declspec(align(16)) var
                 #define ATOMIC(var) __declspec(align(32)) var
         #elif defined(_MSC_VER)
                 #define ALIGN(var) __declspec(align(16)) var
                 #define ATOMIC(var) __declspec(align(32)) var
-               #ifdef USE_THREADS
-                       #define MEMORY_BARRIER (_mm_sfence())
-                       //(MemoryBarrier())
-                       #define ATOMIC_COUNTER volatile LONG
-                       #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
-                       #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
-                       #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
-               #endif
-       #else
-               #undef USE_THREADS
-               #undef SSE2_PRESENT
+               #define MEMORY_BARRIER (_mm_sfence())
+               //(MemoryBarrier())
+               #define ATOMIC_COUNTER volatile LONG
+               #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
+               #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
+               #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
         #endif
  #endif
  
         #endif
  #endif
  
-#ifndef SSE2_PRESENT
-       #define ALIGN(var) var
-       #define ATOMIC(var) var
+#ifndef ALIGN
+#define ALIGN(var) var
  #endif
  #endif
-
-#ifndef USE_THREADS
-       #define MEMORY_BARRIER ((void)0)
-       #define ATOMIC_COUNTER int
-       #define ATOMIC_INCREMENT(counter) (++(counter))
-       #define ATOMIC_DECREMENT(counter) (--(counter))
-       #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
+#ifndef ATOMIC
+#define ATOMIC(var) var
+#endif
+#ifndef MEMORY_BARRIER
+#define MEMORY_BARRIER ((void)0)
+#endif
+#ifndef ATOMIC_COUNTER
+#define ATOMIC_COUNTER int
+#endif
+#ifndef ATOMIC_INCREMENT
+#define ATOMIC_INCREMENT(counter) (++(counter))
+#endif
+#ifndef ATOMIC_DECREMENT
+#define ATOMIC_DECREMENT(counter) (--(counter))
+#endif
+#ifndef ATOMIC_ADD
+#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
  #endif
  
  #ifdef SSE2_PRESENT
  #endif
  
  #ifdef SSE2_PRESENT
@@ -71,7 +74,7 @@ typedef qboolean bool;
  static void *MM_CALLOC(size_t nmemb, size_t size)
  {
         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
  static void *MM_CALLOC(size_t nmemb, size_t size)
  {
         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
-       if(ptr != NULL) memset(ptr, 0, nmemb*size);
+       if (ptr != NULL) memset(ptr, 0, nmemb*size);
         return ptr;
  }
  
         return ptr;
  }
  
@@ -178,7 +181,6 @@ typedef ALIGN(struct DPSOFTRAST_State_Span_s
         int triangle; // triangle this span was generated by
         int x; // framebuffer x coord
         int y; // framebuffer y coord
         int triangle; // triangle this span was generated by
         int x; // framebuffer x coord
         int y; // framebuffer y coord
-       int length; // pixel count
         int startx; // usable range (according to pixelmask)
         int endx; // usable range (according to pixelmask)
         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
         int startx; // usable range (according to pixelmask)
         int endx; // usable range (according to pixelmask)
         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
@@ -204,15 +206,14 @@ typedef enum DPSOFTRAST_BLENDMODE_e
         DPSOFTRAST_BLENDMODE_MUL2,
         DPSOFTRAST_BLENDMODE_SUBALPHA,
         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
         DPSOFTRAST_BLENDMODE_MUL2,
         DPSOFTRAST_BLENDMODE_SUBALPHA,
         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
+       DPSOFTRAST_BLENDMODE_INVADD,
         DPSOFTRAST_BLENDMODE_TOTAL
  }
  DPSOFTRAST_BLENDMODE;
  
  typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
  {
         DPSOFTRAST_BLENDMODE_TOTAL
  }
  DPSOFTRAST_BLENDMODE;
  
  typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
  {
-#ifdef USE_THREADS
-       SDL_Thread *thread;
-#endif
+       void *thread;
         int index;
         
         int cullface;
         int index;
         
         int cullface;
@@ -244,7 +245,7 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
  
         // derived values (DPSOFTRAST_VALIDATE_FB)
         int fb_colormask;
  
         // derived values (DPSOFTRAST_VALIDATE_FB)
         int fb_colormask;
-       int fb_clearscissor[4];
+       int fb_scissor[4];
         ALIGN(float fb_viewportcenter[4]);
         ALIGN(float fb_viewportscale[4]);
  
         ALIGN(float fb_viewportcenter[4]);
         ALIGN(float fb_viewportscale[4]);
  
@@ -254,12 +255,19 @@ typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
         int fb_blendmode;
  
         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
         int fb_blendmode;
  
-       ATOMIC(int commandoffset);
+       // band boundaries
+       int miny1;
+       int maxy1;
+       int miny2;
+       int maxy2;
  
  
-       bool waiting;
-#ifdef USE_THREADS
-       SDL_cond *waitcond;
-#endif
+       ATOMIC(volatile int commandoffset);
+
+       volatile bool waiting;
+       volatile bool starving;
+       void *waitcond;
+       void *drawcond;
+       void *drawmutex;
  
         int numspans;
         int numtriangles;
  
         int numspans;
         int numtriangles;
@@ -314,14 +322,12 @@ typedef ATOMIC(struct DPSOFTRAST_State_s
         // error reporting
         const char *errorstring;
  
         // error reporting
         const char *errorstring;
  
+       bool usethreads;
+       int interlace;
         int numthreads;
         DPSOFTRAST_State_Thread *threads;
         int numthreads;
         DPSOFTRAST_State_Thread *threads;
-#ifdef USE_THREADS
-       SDL_mutex *drawmutex;
-       SDL_cond *drawcond;
-#endif
  
  
-       ATOMIC(int drawcommand);
+       ATOMIC(volatile int drawcommand);
  
         DPSOFTRAST_State_Command_Pool commandpool;
  }
  
         DPSOFTRAST_State_Command_Pool commandpool;
  }
@@ -329,8 +335,6 @@ DPSOFTRAST_State);
  
  DPSOFTRAST_State dpsoftrast;
  
  
  DPSOFTRAST_State dpsoftrast;
  
-extern int dpsoftrast_test;
-
  #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
  #define DPSOFTRAST_DEPTHOFFSET (128.0f)
  #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
  #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
  #define DPSOFTRAST_DEPTHOFFSET (128.0f)
  #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
@@ -364,10 +368,10 @@ static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
         if (y1 < 0) y1 = 0;
         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
         if (y1 < 0) y1 = 0;
         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
-       thread->fb_clearscissor[0] = x1;
-       thread->fb_clearscissor[1] = y1;
-       thread->fb_clearscissor[2] = x2 - x1;
-       thread->fb_clearscissor[3] = y2 - y1;
+       thread->fb_scissor[0] = x1;
+       thread->fb_scissor[1] = y1;
+       thread->fb_scissor[2] = x2 - x1;
+       thread->fb_scissor[3] = y2 - y1;
  
         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
  }
  
         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
  }
@@ -385,7 +389,7 @@ static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
                 {
                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
                 {
                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
-               BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
+               BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                 }
         }
                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                 }
         }
@@ -402,7 +406,7 @@ static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
-               BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
+               BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                 }
         }
                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                 }
         }
@@ -453,13 +457,13 @@ static void DPSOFTRAST_Texture_Grow(void)
                 dpsoftrast.texture_max *= 2;
         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
                 dpsoftrast.texture_max *= 2;
         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
-               if(dpsoftrast.texbound[i])
+               if (dpsoftrast.texbound[i])
                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
         for (j = 0; j < dpsoftrast.numthreads; j++)
         {
                 thread = &dpsoftrast.threads[j];
                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
         for (j = 0; j < dpsoftrast.numthreads; j++)
         {
                 thread = &dpsoftrast.threads[j];
                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
-                       if(thread->texbound[i])
+                       if (thread->texbound[i])
                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
         }
  }
                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
         }
  }
@@ -764,13 +768,12 @@ static void DPSOFTRAST_Draw_FlushThreads(void);
  
  static void DPSOFTRAST_Draw_SyncCommands(void)
  {
  
  static void DPSOFTRAST_Draw_SyncCommands(void)
  {
-       MEMORY_BARRIER;
+       if(dpsoftrast.usethreads) MEMORY_BARRIER;
         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
  }
  
  static void DPSOFTRAST_Draw_FreeCommandPool(int space)
  {
         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
  }
  
  static void DPSOFTRAST_Draw_FreeCommandPool(int space)
  {
-#ifdef USE_THREADS
         DPSOFTRAST_State_Thread *thread;
         int i;
         int freecommand = dpsoftrast.commandpool.freecommand;
         DPSOFTRAST_State_Thread *thread;
         int i;
         int freecommand = dpsoftrast.commandpool.freecommand;
@@ -778,7 +781,6 @@ static void DPSOFTRAST_Draw_FreeCommandPool(int space)
         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
                 return;
         DPSOFTRAST_Draw_SyncCommands();
         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
                 return;
         DPSOFTRAST_Draw_SyncCommands();
-       SDL_LockMutex(dpsoftrast.drawmutex);
         for(;;)
         {
                 int waitindex = -1;
         for(;;)
         {
                 int waitindex = -1;
@@ -799,16 +801,17 @@ static void DPSOFTRAST_Draw_FreeCommandPool(int space)
                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                         break;
                 thread = &dpsoftrast.threads[waitindex];
                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                         break;
                 thread = &dpsoftrast.threads[waitindex];
-               thread->waiting = true;
-               SDL_CondBroadcast(dpsoftrast.drawcond);
-               SDL_CondWait(thread->waitcond, dpsoftrast.drawmutex);
-               thread->waiting = false;
+               Thread_LockMutex(thread->drawmutex);
+               if (thread->commandoffset != dpsoftrast.drawcommand)
+               {
+                       thread->waiting = true;
+                       if (thread->starving) Thread_CondSignal(thread->drawcond);
+                       Thread_CondWait(thread->waitcond, thread->drawmutex);
+                       thread->waiting = false;
+               }
+               Thread_UnlockMutex(thread->drawmutex);
         }
         }
-       SDL_UnlockMutex(dpsoftrast.drawmutex);
         dpsoftrast.commandpool.usedcommands = usedcommands;
         dpsoftrast.commandpool.usedcommands = usedcommands;
-#else
-       DPSOFTRAST_Draw_FlushThreads();
-#endif
  }
  
  #define DPSOFTRAST_ALIGNCOMMAND(size) \
  }
  
  #define DPSOFTRAST_ALIGNCOMMAND(size) \
@@ -822,15 +825,18 @@ static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
         int freecommand = dpsoftrast.commandpool.freecommand;
         int usedcommands = dpsoftrast.commandpool.usedcommands;
         int extra = sizeof(DPSOFTRAST_Command);
         int freecommand = dpsoftrast.commandpool.freecommand;
         int usedcommands = dpsoftrast.commandpool.usedcommands;
         int extra = sizeof(DPSOFTRAST_Command);
-       if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
+       if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
-       if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
+       if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
         {
         {
-               DPSOFTRAST_Draw_FreeCommandPool(size + extra);
+               if (dpsoftrast.usethreads)
+                       DPSOFTRAST_Draw_FreeCommandPool(size + extra);
+               else
+                       DPSOFTRAST_Draw_FlushThreads();
                 freecommand = dpsoftrast.commandpool.freecommand;
                 usedcommands = dpsoftrast.commandpool.usedcommands;
         }
                 freecommand = dpsoftrast.commandpool.freecommand;
                 usedcommands = dpsoftrast.commandpool.usedcommands;
         }
-       if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
+       if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
         {
                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
                 command->opcode = DPSOFTRAST_OPCODE_Reset;
         {
                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
                 command->opcode = DPSOFTRAST_OPCODE_Reset;
@@ -853,6 +859,8 @@ static void DPSOFTRAST_UndoCommand(int size)
         int freecommand = dpsoftrast.commandpool.freecommand;
         int usedcommands = dpsoftrast.commandpool.usedcommands;
         freecommand -= size;
         int freecommand = dpsoftrast.commandpool.freecommand;
         int usedcommands = dpsoftrast.commandpool.usedcommands;
         freecommand -= size;
+       if (freecommand < 0)
+               freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
         usedcommands -= size;
         dpsoftrast.commandpool.freecommand = freecommand;
         dpsoftrast.commandpool.usedcommands = usedcommands;
         usedcommands -= size;
         dpsoftrast.commandpool.freecommand = freecommand;
         dpsoftrast.commandpool.usedcommands = usedcommands;
@@ -885,18 +893,21 @@ void DPSOFTRAST_Viewport(int x, int y, int width, int height)
  DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
  static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
  {
  DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
  static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
  {
-       int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
+       int i, x1, y1, x2, y2, w, h, x, y;
+       int miny1 = thread->miny1;
+       int maxy1 = thread->maxy1;
+       int miny2 = thread->miny2;
+       int maxy2 = thread->maxy2;
+       int bandy;
         unsigned int *p;
         unsigned int c;
         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
         unsigned int *p;
         unsigned int c;
         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
-       x1 = thread->fb_clearscissor[0];
-       y1 = thread->fb_clearscissor[1];
-       x2 = thread->fb_clearscissor[2];
-       y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
-       t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
-       t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
-       if(y1 < t1) y1 = t1;
-       if(y2 > t2) y2 = t2;
+       x1 = thread->fb_scissor[0];
+       y1 = thread->fb_scissor[1];
+       x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
+       y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
+       if (y1 < miny1) y1 = miny1;
+       if (y2 > maxy2) y2 = maxy2;
         w = x2 - x1;
         h = y2 - y1;
         if (w < 1 || h < 1)
         w = x2 - x1;
         h = y2 - y1;
         if (w < 1 || h < 1)
@@ -907,7 +918,8 @@ static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, con
         {
                 if (!dpsoftrast.fb_colorpixels[i])
                         continue;
         {
                 if (!dpsoftrast.fb_colorpixels[i])
                         continue;
-               for (y = y1;y < y2;y++)
+               for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
+               for (;y < bandy;y++)
                 {
                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
                         for (x = x1;x < x2;x++)
                 {
                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
                         for (x = x1;x < x2;x++)
@@ -927,24 +939,28 @@ void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
  DEFCOMMAND(3, ClearDepth, float depth;)
  static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
  {
  DEFCOMMAND(3, ClearDepth, float depth;)
  static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
  {
-       int x1, y1, x2, y2, w, h, x, y, t1, t2;
+       int x1, y1, x2, y2, w, h, x, y;
+       int miny1 = thread->miny1;
+       int maxy1 = thread->maxy1;
+       int miny2 = thread->miny2;
+       int maxy2 = thread->maxy2;
+       int bandy;
         unsigned int *p;
         unsigned int c;
         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
         unsigned int *p;
         unsigned int c;
         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
-       x1 = thread->fb_clearscissor[0];
-       y1 = thread->fb_clearscissor[1];
-       x2 = thread->fb_clearscissor[2];
-       y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
-       t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
-       t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
-       if(y1 < t1) y1 = t1;
-       if(y2 > t2) y2 = t2;
+       x1 = thread->fb_scissor[0];
+       y1 = thread->fb_scissor[1];
+       x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
+       y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
+       if (y1 < miny1) y1 = miny1;
+       if (y2 > maxy2) y2 = maxy2;
         w = x2 - x1;
         h = y2 - y1;
         if (w < 1 || h < 1)
                 return;
         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
         w = x2 - x1;
         h = y2 - y1;
         if (w < 1 || h < 1)
                 return;
         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
-       for (y = y1;y < y2;y++)
+       for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
+       for (;y < bandy;y++)
         {
                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
                 for (x = x1;x < x2;x++)
         {
                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
                 for (x = x1;x < x2;x++)
@@ -1143,7 +1159,6 @@ void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockh
         int bx2 = blockx + blockwidth;
         int by2 = blocky + blockheight;
         int bw;
         int bx2 = blockx + blockwidth;
         int by2 = blocky + blockheight;
         int bw;
-       int bh;
         int x;
         int y;
         unsigned char *inpixels;
         int x;
         int y;
         unsigned char *inpixels;
@@ -1155,7 +1170,6 @@ void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockh
         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
         bw = bx2 - bx1;
         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
         bw = bx2 - bx1;
-       bh = by2 - by1;
         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
         if (dpsoftrast.bigendian)
         {
         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
         if (dpsoftrast.bigendian)
         {
@@ -1209,8 +1223,7 @@ void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int s
         DPSOFTRAST_Texture *texture;
         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
         if (mip < 0 || mip >= texture->mipmaps) return;
         DPSOFTRAST_Texture *texture;
         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
         if (mip < 0 || mip >= texture->mipmaps) return;
-       if (texture->binds)
-               DPSOFTRAST_Flush();
+       DPSOFTRAST_Flush();
         spixels = dpsoftrast.fb_colorpixels[0];
         swidth = dpsoftrast.fb_width;
         sheight = dpsoftrast.fb_height;
         spixels = dpsoftrast.fb_colorpixels[0];
         swidth = dpsoftrast.fb_width;
         sheight = dpsoftrast.fb_height;
@@ -1351,7 +1364,7 @@ void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int
         {
                 __m128 m0, m1, m2, m3;
                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
         {
                 __m128 m0, m1, m2, m3;
                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
-               command->index = index;
+               command->index = (DPSOFTRAST_UNIFORM)index;
                 if (((size_t)v)&(ALIGN_SIZE-1))
                 {
                         m0 = _mm_loadu_ps(v);
                 if (((size_t)v)&(ALIGN_SIZE-1))
                 {
                         m0 = _mm_loadu_ps(v);
@@ -1747,11 +1760,9 @@ static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m12
         *endy = _mm_cvttss_si32(minproj)+1;
         return clipmask;
  }
         *endy = _mm_cvttss_si32(minproj)+1;
         return clipmask;
  }
-#endif
         
  static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
  {
         
  static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
  {
-#ifdef SSE2_PRESENT
         float *end = out4f + numitems*4;
         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
         __m128 minpos, maxpos;
         float *end = out4f + numitems*4;
         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
         __m128 minpos, maxpos;
@@ -1794,12 +1805,10 @@ static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty,
                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
         return 0;
                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
         return 0;
-#endif
  }
  
  static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
  {
  }
  
  static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
  {
-#ifdef SSE2_PRESENT
         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
         float *end;
         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
         float *end;
@@ -1849,11 +1858,12 @@ static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int
         if (starty && endy) 
                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
         return 0;
         if (starty && endy) 
                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
         return 0;
-#endif
  }
  }
+#endif
  
  static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
  {
  
  static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
  {
+#ifdef SSE2_PRESENT
         float *outf = dpsoftrast.post_array4f[outarray];
         const unsigned char *inb;
         int firstvertex = dpsoftrast.firstvertex;
         float *outf = dpsoftrast.post_array4f[outarray];
         const unsigned char *inb;
         int firstvertex = dpsoftrast.firstvertex;
@@ -1905,6 +1915,9 @@ static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
                 break;
         }
         return outf;
                 break;
         }
         return outf;
+#else
+       return NULL;
+#endif
  }
  
  static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
  }
  
  static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
@@ -1917,17 +1930,25 @@ static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float
  #if 0
  static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
  {
  #if 0
  static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
  {
+#ifdef SSE2_PRESENT
         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
         return data;
         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
         return data;
+#else
+       return NULL;
+#endif
  }
  #endif
  
  static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
  {
  }
  #endif
  
  static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
  {
+#ifdef SSE2_PRESENT
         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
         return data;
         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
         return data;
+#else
+       return NULL;
+#endif
  }
  
  void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
  }
  
  void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
@@ -1942,7 +1963,7 @@ void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAS
         {
                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                 float z = endz, dz;
         {
                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                 float z = endz, dz;
-               if(nextsub >= endx) nextsub = endsub = endx-1;
+               if (nextsub >= endx) nextsub = endsub = endx-1;
                 endz = 1.0f / (w + wslope * nextsub);
                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
                 for (; x <= endsub; x++, z += dz)
                 endz = 1.0f / (w + wslope * nextsub);
                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
                 for (; x <= endsub; x++, z += dz)
@@ -2111,6 +2132,21 @@ void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRA
                         pixel[x*4+3] = d[3];
                 }
                 break;
                         pixel[x*4+3] = d[3];
                 }
                 break;
+       case DPSOFTRAST_BLENDMODE_INVADD:
+               for (x = startx;x < endx;x++)
+               {
+                       if (!pixelmask[x])
+                               continue;
+                       d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
+                       d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
+                       d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
+                       d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
+                       pixel[x*4+0] = d[0];
+                       pixel[x*4+1] = d[1];
+                       pixel[x*4+2] = d[2];
+                       pixel[x*4+3] = d[3];
+               }
+               break;
         }
  }
  
         }
  }
  
@@ -2157,7 +2193,7 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
                 break;
         case DPSOFTRAST_BLENDMODE_ALPHA:
         #define FINISHBLEND(blend2, blend1) \
                 break;
         case DPSOFTRAST_BLENDMODE_ALPHA:
         #define FINISHBLEND(blend2, blend1) \
-               for (x = startx;x + 2 <= endx;x += 2) \
+               for (x = startx;x + 1 < endx;x += 2) \
                 { \
                         __m128i src, dst; \
                         switch (*(const unsigned short*)&pixelmask[x]) \
                 { \
                         __m128i src, dst; \
                         switch (*(const unsigned short*)&pixelmask[x]) \
@@ -2245,6 +2281,13 @@ void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPS
                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
                 });
                 break;
                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
                 });
                 break;
+       case DPSOFTRAST_BLENDMODE_INVADD:
+               FINISHBLEND({
+                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+               }, {
+                       dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
+               });
+               break;
         }
  #endif
  }
         }
  #endif
  }
@@ -2321,10 +2364,10 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                 unsigned int substep[2];
                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                 unsigned int substep[2];
                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
-               if(nextsub >= endx)
+               if (nextsub >= endx)
                 {
                         nextsub = endsub = endx-1;      
                 {
                         nextsub = endsub = endx-1;      
-                       if(x < nextsub) subscale = 65536.0f / (nextsub - x);
+                       if (x < nextsub) subscale = 65536.0f / (nextsub - x);
                 }
                 tc[0] = endtc[0];
                 tc[1] = endtc[1];
                 }
                 tc[0] = endtc[0];
                 tc[1] = endtc[1];
@@ -2334,7 +2377,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, cons
                 substep[1] = (endtc[1] - tc[1]) * subscale;
                 subtc[0] = tc[0] * (1<<16);
                 subtc[1] = tc[1] * (1<<16);
                 substep[1] = (endtc[1] - tc[1]) * subscale;
                 subtc[0] = tc[0] * (1<<16);
                 subtc[1] = tc[1] * (1<<16);
-               if(filter)
+               if (filter)
                 {
                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
                         {
                 {
                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
                         {
@@ -2455,7 +2498,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
         // if no texture is bound, just fill it with white
         if (!texture)
         {
         // if no texture is bound, just fill it with white
         if (!texture)
         {
-               memset(out4ub + startx*4, 255, span->length*4);
+               memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
                 return;
         }
         mip = triangle->mip[texunitindex];
                 return;
         }
         mip = triangle->mip[texunitindex];
@@ -2484,10 +2527,10 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
         {
                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
         {
                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
-               if(nextsub >= endx)
+               if (nextsub >= endx)
                 {
                         nextsub = endsub = endx-1;
                 {
                         nextsub = endsub = endx-1;
-                       if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
+                       if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
                 }       
                 tc = endtc;
                 subtc = endsubtc;
                 }       
                 tc = endtc;
                 subtc = endsubtc;
@@ -2714,7 +2757,7 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread,
  void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
  {
         // TODO: IMPLEMENT
  void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
  {
         // TODO: IMPLEMENT
-       memset(out4ub, 255, span->length*4);
+       memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
  }
  
  float DPSOFTRAST_SampleShadowmap(const float *vector)
  }
  
  float DPSOFTRAST_SampleShadowmap(const float *vector)
@@ -2859,29 +2902,42 @@ void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle *
         int startx = span->startx;
         int endx = span->endx;
         __m128 data, slope;
         int startx = span->startx;
         int endx = span->endx;
         __m128 data, slope;
+       __m128 mod, endmod;
+       __m128i submod, substep, endsubmod;
         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
-       data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
-       data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
-       slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
-       for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
-       {
-               __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
-               __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
-               data = _mm_add_ps(data, slope);
-               mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
-               mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
-               pix = _mm_mulhi_epu16(pix, mod);
-               _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
-       }
-       for (;x < endx;x++, data = _mm_add_ps(data, slope))
+       endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+       endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
+       for (x = startx; x < endx;)
         {
         {
-               __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
-               __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
-               mod = _mm_packs_epi32(mod, mod);
-               pix = _mm_mulhi_epu16(pix, mod);
-               *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+               int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
+               __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
+               if (nextsub >= endx)
+               {
+                       nextsub = endsub = endx-1;
+                       if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
+               }
+               mod = endmod;
+               submod = endsubmod;
+               endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+               substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
+               endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
+               submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
+               substep = _mm_packs_epi32(substep, substep);
+               for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
+               {
+                       __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
+                       pix = _mm_mulhi_epu16(pix, submod);
+                       _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
+               }
+               if (x <= endsub)
+               {
+                       __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
+                       pix = _mm_mulhi_epu16(pix, submod);
+                       *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+                       x++;
+               }
         }
  #endif
  }
         }
  #endif
  }
@@ -2893,25 +2949,40 @@ void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRIC
         int startx = span->startx;
         int endx = span->endx;
         __m128 data, slope;
         int startx = span->startx;
         int endx = span->endx;
         __m128 data, slope;
+       __m128 mod, endmod;
+       __m128i submod, substep, endsubmod;
         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
-       data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
-       data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
-       slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
-       for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
-       {
-               __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
-               data = _mm_add_ps(data, slope);
-               pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
-               pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
-               _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
-       }
-       for (;x < endx;x++, data = _mm_add_ps(data, slope))
+       endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+       endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
+       for (x = startx; x < endx;)
         {
         {
-               __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
-               pix = _mm_packs_epi32(pix, pix);
-               *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+               int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
+               __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
+               if (nextsub >= endx)
+               {
+                       nextsub = endsub = endx-1;
+                       if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
+               }
+               mod = endmod;
+               submod = endsubmod;
+               endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+               substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
+               endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
+               submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
+               substep = _mm_packs_epi32(substep, substep);
+               for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
+               {
+                       __m128i pix = _mm_srai_epi16(submod, 4);
+                       _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
+               }
+               if (x <= endsub)
+               {
+                       __m128i pix = _mm_srai_epi16(submod, 4);
+                       *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+                       x++;
+               }
         }
  #endif
  }
         }
  #endif
  }
@@ -2929,7 +3000,7 @@ void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRI
                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
-       if(x < endx)
+       if (x < endx)
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
@@ -2950,7 +3021,7 @@ void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle *
                 pix1 = _mm_mulhi_epu16(pix1, pix2);
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
                 pix1 = _mm_mulhi_epu16(pix1, pix2);
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
-       if(x < endx)
+       if (x < endx)
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
@@ -2971,7 +3042,7 @@ void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * REST
                 pix1 = _mm_add_epi16(pix1, pix2);
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
                 pix1 = _mm_add_epi16(pix1, pix2);
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
-       if(x < endx)
+       if (x < endx)
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
@@ -2994,7 +3065,7 @@ void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle
                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
-       if(x < endx)
+       if (x < endx)
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
@@ -3016,7 +3087,7 @@ void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * REST
                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
         }
-       if(x < endx)
+       if (x < endx)
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
         {
                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
@@ -3040,7 +3111,7 @@ void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle *
                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
         }
                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
         }
-       if(x < endx)
+       if (x < endx)
         {
                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
         {
                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
@@ -3143,7 +3214,7 @@ void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, con
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -3157,25 +3228,44 @@ void DPSOFTRAST_VertexShader_FlatColor(void)
  
  void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
  {
  
  // Flat color pixel shader for one span: samples the GL20TU_COLOR texture into
  // buffer_texture_colorbgra8 and multiplies each covered pixel by the
  // Color_Ambient uniform (alpha lane taken from the Alpha uniform).
  // Output goes straight to dpsoftrast.fb_colorpixels[0] unless alpha test is
  // enabled or the blend mode is not OPAQUE; in that case the result is written
  // to buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
  // The body is compiled only when SSE2_PRESENT is defined.
  void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
  {
+#ifdef SSE2_PRESENT
+       unsigned char * RESTRICT pixelmask = span->pixelmask;
+       unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
         int x, startx = span->startx, endx = span->endx;
         int x, startx = span->startx, endx = span->endx;
-       int Color_Ambienti[4];
+       __m128i Color_Ambientm;
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
-       Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
-       Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
-       Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
-       Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
         // redirect output to the temporary buffer when the span still needs alpha test / blending
+       if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
+               pixel = buffer_FragColorbgra8;
         // ambient color scaled by 256 with lanes 0 and 2 swapped by the shuffle;
         // lane 3 is then masked out and replaced by the Alpha uniform scaled by 255,
         // and the four 32-bit lanes are packed to 16-bit (duplicated in both halves)
+       Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
+       Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
+       Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
+       Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
         for (x = startx;x < endx;x++)
         {
         for (x = startx;x < endx;x++)
         {
-               buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
-               buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
-               buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
-               buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
+               __m128i color, pix;
                 // fast path: four pixels at once when at least four remain and all four mask bytes equal 1
+               if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
+               {
+                       __m128i pix2;
+                       color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
+                       pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
+                       pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
+                       _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                         // advance by 3 here; the loop's x++ supplies the fourth step
+                       x += 3;
+                       continue;
+               }
                 // slow path: one pixel, skipped when its mask byte is zero
+               if (!pixelmask[x])
+                       continue;
+               color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
+               pix = _mm_mulhi_epu16(Color_Ambientm, color);
+               *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
         }
         }
-       DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
+       if (pixel == buffer_FragColorbgra8)
+               DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
+#endif
  }
  
  
  }
  
  
@@ -3239,7 +3329,7 @@ void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const D
                         x += 3;
                         continue;
                 }
                         x += 3;
                         continue;
                 }
-               if(!pixelmask[x])
+               if (!pixelmask[x])
                         continue;
                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
                         continue;
                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
@@ -3247,7 +3337,7 @@ void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const D
                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
         }
                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
         }
-       if(pixel == buffer_FragColorbgra8)
+       if (pixel == buffer_FragColorbgra8)
                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  #endif
  }
                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  #endif
  }
@@ -3311,7 +3401,7 @@ void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSO
                                 x += 3;
                                 continue;
                         }
                                 x += 3;
                                 continue;
                         }
-                       if(!pixelmask[x])
+                       if (!pixelmask[x])
                                 continue;
                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
                                 continue;
                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
@@ -3339,7 +3429,7 @@ void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSO
                                 x += 3;
                                 continue;
                         }
                                 x += 3;
                                 continue;
                         }
-                       if(!pixelmask[x]) 
+                       if (!pixelmask[x]) 
                                 continue;
                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
                                 continue;
                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
@@ -3347,7 +3437,7 @@ void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSO
                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
                 }
         }
                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
                 }
         }
-       if(pixel == buffer_FragColorbgra8)
+       if (pixel == buffer_FragColorbgra8)
                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  #endif
  }
                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  #endif
  }
@@ -3365,7 +3455,7 @@ void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPS
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -4048,7 +4138,7 @@ void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DP
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -4066,7 +4156,7 @@ void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTR
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -4083,7 +4173,7 @@ void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPS
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -4100,7 +4190,7 @@ void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, co
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -4117,7 +4207,24 @@ void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread,
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
-       memset(buffer_FragColorbgra8, 0, span->length*4);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
+       DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
+}
+
+
+
+void DPSOFTRAST_VertexShader_DeferredBounceLight(void)
+{
+       DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
+}
+
+void DPSOFTRAST_PixelShader_DeferredBounceLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
+{
+       // TODO: IMPLEMENT
+       float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
+       unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
+       DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
+       memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
  }
  
@@ -4150,7 +4257,8 @@ static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COU
         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
-       {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
+       {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
+       {2, DPSOFTRAST_VertexShader_DeferredBounceLight,        DPSOFTRAST_PixelShader_DeferredBounceLight,        {~0}}
  };
  
  void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
  };
  
  void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
@@ -4181,23 +4289,23 @@ void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
+                       startx = span->startx;
+                       endx = span->endx;
                         switch(thread->fb_depthfunc)
                         {
                         default:
                         switch(thread->fb_depthfunc)
                         {
                         default:
-                       case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
-                       case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
-                       case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
-                       case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
-                       case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
-                       case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
-                       case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
+                       case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
+                       case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
+                       case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
+                       case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
+                       case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
+                       case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
+                       case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
                         }
                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
                         }
                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
-                       //for (x = 0;x < span->length;x++)
+                       //for (x = startx;x < endx;x++)
                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
                         // if there is no color buffer, skip pixel shader
                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
                         // if there is no color buffer, skip pixel shader
-                       startx = 0;
-                       endx = span->length;
                         while (startx < endx && !pixelmask[startx])
                                 startx++;
                         while (endx > startx && !pixelmask[endx-1])
                         while (startx < endx && !pixelmask[startx])
                                 startx++;
                         while (endx > startx && !pixelmask[endx-1])
@@ -4223,10 +4331,8 @@ void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
                         // if there is no color buffer, skip pixel shader
                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
                         {
                         // if there is no color buffer, skip pixel shader
                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
                         {
-                               memset(pixelmask, 1, span->length);
+                               memset(pixelmask + span->startx, 1, span->endx - span->startx);
                                 span->pixelmask = pixelmask;
                                 span->pixelmask = pixelmask;
-                               span->startx = 0;
-                               span->endx = span->length;
                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
                         }
                 }
                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
                         }
                 }
@@ -4240,9 +4346,8 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
  {
  #ifdef SSE2_PRESENT
         int cullface = thread->cullface;
  {
  #ifdef SSE2_PRESENT
         int cullface = thread->cullface;
-       int width = dpsoftrast.fb_width;
-       int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
-       int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
+       int minx, maxx, miny, maxy;
+       int miny1, maxy1, miny2, maxy2;
         __m128i fbmin, fbmax;
         __m128 viewportcenter, viewportscale;
         int firstvertex = command->firstvertex;
         __m128i fbmin, fbmax;
         __m128 viewportcenter, viewportscale;
         int firstvertex = command->firstvertex;
@@ -4257,7 +4362,7 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
         int y;
         int e[3];
         __m128i screeny;
         int y;
         int e[3];
         __m128i screeny;
-       int starty, endy;
+       int starty, endy, bandy;
         int numpoints;
         int clipcase;
         float clipdist[4];
         int numpoints;
         int clipcase;
         float clipdist[4];
@@ -4266,7 +4371,14 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
         __m128 screen[4];
         DPSOFTRAST_State_Triangle *triangle;
         DPSOFTRAST_Texture *texture;
         __m128 screen[4];
         DPSOFTRAST_State_Triangle *triangle;
         DPSOFTRAST_Texture *texture;
-       if (command->starty >= maxy || command->endy <= miny)
+       DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
+       miny = thread->fb_scissor[1];
+       maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
+       miny1 = bound(miny, thread->miny1, maxy);
+       maxy1 = bound(miny, thread->maxy1, maxy);
+       miny2 = bound(miny, thread->miny2, maxy);
+       maxy2 = bound(miny, thread->maxy2, maxy);
+       if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
         {
                 if (!ATOMIC_DECREMENT(command->refcount))
                 {
         {
                 if (!ATOMIC_DECREMENT(command->refcount))
                 {
@@ -4275,9 +4387,10 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                 }
                 return;
         }
                 }
                 return;
         }
-       DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
-       fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
-       fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
+       minx = thread->fb_scissor[0];
+       maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
+       fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
+       fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
         viewportscale = _mm_load_ps(thread->fb_viewportscale);
         screen[3] = _mm_setzero_ps();
         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
         viewportscale = _mm_load_ps(thread->fb_viewportscale);
         screen[3] = _mm_setzero_ps();
@@ -4289,18 +4402,18 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
  
                 // generate the 3 edges of this triangle
                 // generate spans for the triangle - switch based on left split or right split classification of triangle
  
                 // generate the 3 edges of this triangle
                 // generate spans for the triangle - switch based on left split or right split classification of triangle
-               if (element3i)
-               {
-                       e[0] = element3i[i*3+0] - firstvertex;
-                       e[1] = element3i[i*3+1] - firstvertex;
-                       e[2] = element3i[i*3+2] - firstvertex;
-               }
-               else if (element3s)
+               if (element3s)
                 {
                         e[0] = element3s[i*3+0] - firstvertex;
                         e[1] = element3s[i*3+1] - firstvertex;
                         e[2] = element3s[i*3+2] - firstvertex;
                 }
                 {
                         e[0] = element3s[i*3+0] - firstvertex;
                         e[1] = element3s[i*3+1] - firstvertex;
                         e[2] = element3s[i*3+2] - firstvertex;
                 }
+               else if (element3i)
+               {
+                       e[0] = element3i[i*3+0] - firstvertex;
+                       e[1] = element3i[i*3+1] - firstvertex;
+                       e[2] = element3i[i*3+2] - firstvertex;
+               }
                 else
                 {
                         e[0] = i*3+0;
                 else
                 {
                         e[0] = i*3+0;
@@ -4442,6 +4555,8 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                                 continue;
                         starty = _mm_extract_epi16(screenmin, 1);
                         endy = _mm_extract_epi16(screenmax, 1)+1;
                                 continue;
                         starty = _mm_extract_epi16(screenmin, 1);
                         endy = _mm_extract_epi16(screenmax, 1)+1;
+                       if (starty >= maxy1 && endy <= miny2)
+                               continue;
                         screeny = _mm_srai_epi32(screeni, 16);
                 }
  
                         screeny = _mm_srai_epi32(screeni, 16);
                 }
  
@@ -4527,8 +4642,9 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                                 }
                         }
                 }
                                 }
                         }
                 }
-
-               for (y = starty; y < endy;)
+       
+               for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
+               for (; y < bandy;)
                 {
                         __m128 xcoords, xslope;
                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
                 {
                         __m128 xcoords, xslope;
                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
@@ -4577,39 +4693,39 @@ static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAS
                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
                         nexty = _mm_extract_epi16(ycc, 0);
                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
                         nexty = _mm_extract_epi16(ycc, 0);
-                       if(nexty >= endy) nexty = endy-1;
-                       if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
-                       {
-                               int tmp = edge0n;
-                               edge0n = edge1n;
-                               edge1n = tmp;
-                               tmp = edge0p;
-                               edge0p = edge1p;
-                               edge1p = tmp;
-                       }
+                       if (nexty >= bandy) nexty = bandy-1;
                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
+                       if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
+                       {
+                               xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
+                               xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
+                       }
                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
                         {
                                 int startx, endx, offset;
                                 startx = _mm_cvtss_si32(xcoords);
                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
                         {
                                 int startx, endx, offset;
                                 startx = _mm_cvtss_si32(xcoords);
                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
-                               if (startx < 0) startx = 0;
-                               if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
+                               if (startx < minx) 
+                               {
+                                       if (startx < 0) startx = 0;
+                                       startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
+                               }
+                               if (endx > maxx) endx = maxx;
                                 if (startx >= endx) continue;
                                 if (startx >= endx) continue;
-                               for (offset = startx; offset < endx;)
+                               for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
                                 {
                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
                                         span->triangle = thread->numtriangles;
                                         span->x = offset;
                                         span->y = y;
                                 {
                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
                                         span->triangle = thread->numtriangles;
                                         span->x = offset;
                                         span->y = y;
-                                       span->length = endx - offset;
-                                       if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
-                                               span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
-                                       offset += span->length;
+                                       span->startx = max(minx - offset, 0);
+                                       span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
+                                       if (span->startx >= span->endx)
+                                               continue; 
                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
                                 }
                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
                                 }
@@ -4652,10 +4768,10 @@ static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstver
                         break;
                 datasize += numvertices*sizeof(float[4]);
         }
                         break;
                 datasize += numvertices*sizeof(float[4]);
         }
-       if (element3i)
-               datasize += numtriangles*sizeof(int[3]);
-       else if (element3s)
+       if (element3s)
                 datasize += numtriangles*sizeof(unsigned short[3]);
                 datasize += numtriangles*sizeof(unsigned short[3]);
+       else if (element3i)
+               datasize += numtriangles*sizeof(int[3]);
         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
         {
         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
         {
@@ -4688,16 +4804,16 @@ static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstver
         }
         command->element3i = NULL;
         command->element3s = NULL;
         }
         command->element3i = NULL;
         command->element3s = NULL;
-       if (element3i)
-       {
-               command->element3i = (int *)data;
-               memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
-       }
-       else if (element3s)
+       if (element3s)
         {
                 command->element3s = (unsigned short *)data;
                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
         }
         {
                 command->element3s = (unsigned short *)data;
                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
         }
+       else if (element3i)
+       {
+               command->element3i = (int *)data;
+               memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
+       }
         return command;
  }
  
         return command;
  }
  
@@ -4717,14 +4833,21 @@ void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles
         command->clipped = dpsoftrast.drawclipped;
         command->refcount = dpsoftrast.numthreads;
  
         command->clipped = dpsoftrast.drawclipped;
         command->refcount = dpsoftrast.numthreads;
  
-#ifdef USE_THREADS
-       DPSOFTRAST_Draw_SyncCommands();
-       //SDL_LockMutex(dpsoftrast.drawmutex);
-       SDL_CondBroadcast(dpsoftrast.drawcond);
-       //SDL_UnlockMutex(dpsoftrast.drawmutex);
-#else
-       DPSOFTRAST_Draw_FlushThreads();
-#endif
+       if (dpsoftrast.usethreads)
+       {
+               int i;
+               DPSOFTRAST_Draw_SyncCommands();
+               for (i = 0; i < dpsoftrast.numthreads; i++)
+               {
+                       DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
+                       if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
+                               Thread_CondSignal(thread->drawcond);
+               }
+       }
+       else
+       {
+               DPSOFTRAST_Draw_FlushThreads();
+       }
  }
   
  static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
  }
   
  static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
@@ -4780,7 +4903,6 @@ static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, i
         thread->commandoffset = commandoffset;
  }
  
         thread->commandoffset = commandoffset;
  }
  
-#ifdef USE_THREADS
  static int DPSOFTRAST_Draw_Thread(void *data)
  {
         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
  static int DPSOFTRAST_Draw_Thread(void *data)
  {
         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
@@ -4792,48 +4914,63 @@ static int DPSOFTRAST_Draw_Thread(void *data)
                 }
                 else 
                 {
                 }
                 else 
                 {
-                       SDL_LockMutex(dpsoftrast.drawmutex);
-                       if (thread->commandoffset != dpsoftrast.drawcommand)
+                       Thread_LockMutex(thread->drawmutex);
+                       if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
                         {
                         {
-                               SDL_UnlockMutex(dpsoftrast.drawmutex);
-                               continue;
+                               if (thread->waiting) Thread_CondSignal(thread->waitcond);
+                               thread->starving = true;
+                               Thread_CondWait(thread->drawcond, thread->drawmutex);
+                               thread->starving = false;
                         }
                         }
-                       if (thread->waiting) SDL_CondSignal(thread->waitcond);
-                       SDL_CondWait(dpsoftrast.drawcond, dpsoftrast.drawmutex);
-                       SDL_UnlockMutex(dpsoftrast.drawmutex);
+                       Thread_UnlockMutex(thread->drawmutex);
                 }
         }   
         return 0;
  }
                 }
         }   
         return 0;
  }
-#endif
  
  static void DPSOFTRAST_Draw_FlushThreads(void)
  {
         DPSOFTRAST_State_Thread *thread;
         int i;
         DPSOFTRAST_Draw_SyncCommands();
  
  static void DPSOFTRAST_Draw_FlushThreads(void)
  {
         DPSOFTRAST_State_Thread *thread;
         int i;
         DPSOFTRAST_Draw_SyncCommands();
-#ifdef USE_THREADS
-       SDL_LockMutex(dpsoftrast.drawmutex);
-#endif
-       for (i = 0; i < dpsoftrast.numthreads; i++)
+       if (dpsoftrast.usethreads) 
         {
         {
-               thread = &dpsoftrast.threads[i];
-#ifdef USE_THREADS
-               while (thread->commandoffset != dpsoftrast.drawcommand)
+               for (i = 0; i < dpsoftrast.numthreads; i++)
                 {
                 {
-                       thread->waiting = true;
-                       SDL_CondBroadcast(dpsoftrast.drawcond);
-                       SDL_CondWait(thread->waitcond, dpsoftrast.drawmutex);
-                       thread->waiting = false;
+                       thread = &dpsoftrast.threads[i];
+                       if (thread->commandoffset != dpsoftrast.drawcommand)
+                       {
+                               Thread_LockMutex(thread->drawmutex);
+                               if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
+                                       Thread_CondSignal(thread->drawcond);
+                               Thread_UnlockMutex(thread->drawmutex);
+                       }
+               }
+               for (i = 0; i < dpsoftrast.numthreads; i++)
+               {
+                       thread = &dpsoftrast.threads[i];
+                       if (thread->commandoffset != dpsoftrast.drawcommand)
+                       {
+                               Thread_LockMutex(thread->drawmutex);
+                               if (thread->commandoffset != dpsoftrast.drawcommand)
+                               {
+                                       thread->waiting = true;
+                                       Thread_CondWait(thread->waitcond, thread->drawmutex);
+                                       thread->waiting = false;
+                               }
+                               Thread_UnlockMutex(thread->drawmutex);
+                       }
+               }
+       }
+       else
+       {
+               for (i = 0; i < dpsoftrast.numthreads; i++)
+               {
+                       thread = &dpsoftrast.threads[i];
+                       if (thread->commandoffset != dpsoftrast.drawcommand)
+                               DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
                 }
                 }
-#else
-               if (thread->commandoffset != dpsoftrast.drawcommand)
-                       DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
-#endif
         }
         }
-#ifdef USE_THREADS
-       SDL_UnlockMutex(dpsoftrast.drawmutex);
-#endif
         dpsoftrast.commandpool.usedcommands = 0;
  }
  
         dpsoftrast.commandpool.usedcommands = 0;
  }
  
@@ -4847,7 +4984,7 @@ void DPSOFTRAST_Finish(void)
         DPSOFTRAST_Flush();
  }
  
         DPSOFTRAST_Flush();
  }
  
-void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
+int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
  {
         int i;
         union
  {
         int i;
         union
@@ -4878,13 +5015,9 @@ void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorp
         dpsoftrast.color[1] = 1;
         dpsoftrast.color[2] = 1;
         dpsoftrast.color[3] = 1;
         dpsoftrast.color[1] = 1;
         dpsoftrast.color[2] = 1;
         dpsoftrast.color[3] = 1;
-#ifdef USE_THREADS
-       dpsoftrast.numthreads = bound(1, numthreads, 64);
-       dpsoftrast.drawmutex = SDL_CreateMutex();
-       dpsoftrast.drawcond = SDL_CreateCond();
-#else
-       dpsoftrast.numthreads = 1;
-#endif
+       dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
+       dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
+       dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
         for (i = 0; i < dpsoftrast.numthreads; i++)
         {
         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
         for (i = 0; i < dpsoftrast.numthreads; i++)
         {
@@ -4915,48 +5048,59 @@ void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorp
                 thread->depthrange[1] = 1;
                 thread->polygonoffset[0] = 0;
                 thread->polygonoffset[1] = 0;
                 thread->depthrange[1] = 1;
                 thread->polygonoffset[0] = 0;
                 thread->polygonoffset[1] = 0;
+       
+               if (dpsoftrast.interlace)
+               {
+                       thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+                       thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+                       thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+                       thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
+               }
+               else
+               {
+                       thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
+                       thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
+               }
  
                 thread->numspans = 0;
                 thread->numtriangles = 0;
                 thread->commandoffset = 0;
                 thread->waiting = false;
  
                 thread->numspans = 0;
                 thread->numtriangles = 0;
                 thread->commandoffset = 0;
                 thread->waiting = false;
-#ifdef USE_THREADS
-               thread->waitcond = SDL_CreateCond();
-#endif
-
+               thread->starving = false;
+          
                 thread->validate = -1;
                 DPSOFTRAST_Validate(thread, -1);
                 thread->validate = -1;
                 DPSOFTRAST_Validate(thread, -1);
-#ifdef USE_THREADS
-               thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
-#endif
+ 
+               if (dpsoftrast.usethreads)
+               {
+                       thread->waitcond = Thread_CreateCond();
+                       thread->drawcond = Thread_CreateCond();
+                       thread->drawmutex = Thread_CreateMutex();
+                       thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
+               }
         }
         }
+       return 0;
  }
  
  void DPSOFTRAST_Shutdown(void)
  {
         int i;
  }
  
  void DPSOFTRAST_Shutdown(void)
  {
         int i;
-#ifdef USE_THREADS
-       if(dpsoftrast.numthreads > 0)
+       if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
         {
                 DPSOFTRAST_State_Thread *thread;
         {
                 DPSOFTRAST_State_Thread *thread;
-               SDL_LockMutex(dpsoftrast.drawmutex);
                 for (i = 0; i < dpsoftrast.numthreads; i++)
                 {
                         thread = &dpsoftrast.threads[i];
                 for (i = 0; i < dpsoftrast.numthreads; i++)
                 {
                         thread = &dpsoftrast.threads[i];
+                       Thread_LockMutex(thread->drawmutex);
                         thread->index = -1;
                         thread->index = -1;
+                       Thread_CondSignal(thread->drawcond);
+                       Thread_UnlockMutex(thread->drawmutex);
+                       Thread_WaitThread(thread->thread, 0);
+                       Thread_DestroyCond(thread->waitcond);
+                       Thread_DestroyCond(thread->drawcond);
+                       Thread_DestroyMutex(thread->drawmutex);
                 }
                 }
-               SDL_CondBroadcast(dpsoftrast.drawcond);
-               SDL_UnlockMutex(dpsoftrast.drawmutex);
-               for (i = 0; i < dpsoftrast.numthreads; i++)
-               {
-                       thread = &dpsoftrast.threads[i];
-                       SDL_WaitThread(thread->thread, NULL);
-                       SDL_DestroyCond(thread->waitcond);
-               }
-               SDL_DestroyMutex(dpsoftrast.drawmutex);
-               SDL_DestroyCond(dpsoftrast.drawcond);
         }
         }
-#endif
         for (i = 0;i < dpsoftrast.texture_end;i++)
                 if (dpsoftrast.texture[i].bytes)
                         MM_FREE(dpsoftrast.texture[i].bytes);
         for (i = 0;i < dpsoftrast.texture_end;i++)
                 if (dpsoftrast.texture[i].bytes)
                         MM_FREE(dpsoftrast.texture[i].bytes);