]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
changed pixelmask processing in DPSOFTRAST_Draw_Span_FinishBGRA8 to
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Per-toolchain alignment and atomic primitives, only selected when
// SSE_POSSIBLE is defined. Anything left undefined here is given a plain
// (non-atomic, unaligned) fallback by the #ifndef section that follows.
#if defined(SSE_POSSIBLE)
#	if defined(__APPLE__)
#		include <libkern/OSAtomic.h>
#		define ALIGN(var) var __attribute__((__aligned__(16)))
#		define ATOMIC(var) var __attribute__((__aligned__(32)))
#		define MEMORY_BARRIER (_mm_sfence())
#		define ATOMIC_COUNTER volatile int32_t
#		define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
#		define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
#		define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
#	elif defined(__GNUC__)
#		define ALIGN(var) var __attribute__((__aligned__(16)))
#		define ATOMIC(var) var __attribute__((__aligned__(32)))
#		define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
#		define ATOMIC_COUNTER volatile int
#		define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
#		define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
#		define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#	elif defined(_MSC_VER)
#		define ALIGN(var) __declspec(align(16)) var
#		define ATOMIC(var) __declspec(align(32)) var
#		define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
#		define ATOMIC_COUNTER volatile LONG
#		define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
#		define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
#		define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
#	endif
#endif
50
// Fallbacks for every macro the platform section did not provide.
// These are ordinary, non-atomic operations with no alignment request.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif

#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif

#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif

#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif

// both yield the updated value, like the atomic versions
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif

#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif

#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Allocation wrappers.
// With SSE_POSSIBLE the blocks come from _mm_malloc aligned to ATOMIC_SIZE
// bytes and must be released with MM_FREE (_mm_free); otherwise the plain
// C allocator is used.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() equivalent on top of _mm_malloc: returns nmemb*size zeroed bytes,
// or NULL if the allocation fails or nmemb*size does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject products that would wrap around, as calloc() does
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Indices of the per-vertex attribute arrays.
// DPSOFTRAST_ARRAY_TOTAL is the array count and sizes attribs[] / post_array4f[].
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags;
111         int width;
112         int height;
113         int depth;
114         int sides;
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps;
117         int size;
118         ATOMIC_COUNTER binds;
119         unsigned char *bytes;
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// Command records share the ALIGN alignment (ALIGN_SIZE bytes when an
// aligned ALIGN macro is available).
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
126
// Common header that every command structure starts with.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
128 {
// identifies the command type (a DPSOFTRAST_OPCODE_* constant)
129         unsigned char opcode;
// NOTE(review): presumably the size in bytes of the whole command record - confirm against the code that fills it in
130         unsigned short commandsize;
131 }
132 DPSOFTRAST_Command);
133
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
135
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned structure
// DPSOFTRAST_Command_<name>, which consists of the common header
// (opcode, commandsize) followed by the caller supplied 'fields'.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
139         { \
140                 unsigned char opcode; \
141                 unsigned short commandsize; \
142                 fields \
143         } DPSOFTRAST_Command_##name );
144
// byte capacity of the command pool buffer below (2 MiB)
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have - its use is outside this part of the file
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
// Fixed size buffer holding queued command records.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
// NOTE(review): bookkeeping for the commands[] buffer; exact meaning (offset vs. count) is defined by code outside this view
150         int freecommand;
151         int usedcommands;
// raw storage for the command records
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
// plane equation of each attribute array, 4 components each:
// [array][0] = change per x step, [array][1] = change per y step,
// [array][2] = base value, so value(x,y) = [2] + x*[0] + y*[1]
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluate one attribute array of a triangle at the start of a span.
//   triangle   - pointer to an object with attribs[array][3][4] (see DPSOFTRAST_State_Triangle)
//   span       - pointer to an object with x and y members
//   data       - receives attribs[2] + x*attribs[0] + y*attribs[1]
//   slope      - receives attribs[0], the change of the attribute per x step
//   arrayindex - which attribute array to evaluate
// Both macros expand to a braced block and evaluate their arguments more
// than once, so pass side-effect free expressions only.
// SSE variant: data and slope are __m128 lvalues, attribs must be 16 byte aligned.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[(arrayindex)][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[(arrayindex)][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[(arrayindex)][1])))); \
}
// scalar variant: data and slope are float[4] arrays
// (span is now parenthesized before ->, so expressions such as &localspan work)
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	slope[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	slope[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	slope[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	data[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	data[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	data[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	data[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
180                                         
// NOTE(review): used by span drawing code outside this part of the file; presumably the pixel count of one sub-span - confirm
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
182
// One horizontal run of pixels produced for a triangle; x and y are what
// the DPSOFTRAST_CALCATTRIB macros read to evaluate attributes.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
184 {
185         int triangle; // triangle this span was generated by
186         int x; // framebuffer x coord
187         int y; // framebuffer y coord
188         int startx; // usable range (according to pixelmask)
189         int endx; // usable range (according to pixelmask)
190         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 }
192 DPSOFTRAST_State_Span);
193
// capacity of the per-thread spans[] and triangles[] arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived fb_* values must be recomputed (see DPSOFTRAST_Validate)
#define DPSOFTRAST_VALIDATE_FB (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
// everything that has to be valid for drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// Blend modes the rasterizer implements; DPSOFTRAST_RecalcBlendFunc maps
// the GL source/destination factor pair onto one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD = 9,
	DPSOFTRAST_BLENDMODE_TOTAL = 10
}
DPSOFTRAST_BLENDMODE;
217
// State owned by one rasterizer thread: its copy of the render state,
// the values derived from it, and its span/triangle work buffers.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
// used by DPSOFTRAST_RecalcThread to pick which framebuffer rows belong to this thread
221         int index;
222         
// render state as set by the application
223         int cullface;
224         int colormask[4];
225         int blendfunc[2];
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
// x, y, width, height
234         int viewport[4];
// x, y, width, height; y is flipped against fb_height in DPSOFTRAST_RecalcFB
235         int scissor[4];
236         float depthrange[2];
237         float polygonoffset[2];
238
239         int shader_mode;
240         int shader_permutation;
241         int shader_exactspecularmath;
242
// point into dpsoftrast.texture[]; rebased by DPSOFTRAST_Texture_Grow when that array moves
243         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
244         
245         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
247
248         // DPSOFTRAST_VALIDATE_ flags (set bits = derived values below are out of date)
249         int validate;
250
251         // derived values (DPSOFTRAST_VALIDATE_FB)
252         int fb_colormask;
// x, y, width, height after clamping to the framebuffer
253         int fb_scissor[4];
254         ALIGN(float fb_viewportcenter[4]);
255         ALIGN(float fb_viewportscale[4]);
256
257         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
// depthfunc, or GL_ALWAYS while depthtest is off
258         int fb_depthfunc;
259
260         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// a DPSOFTRAST_BLENDMODE value
261         int fb_blendmode;
262
263         // band boundaries (two row ranges, identical unless dpsoftrast.interlace is set)
264         int miny1;
265         int maxy1;
266         int miny2;
267         int maxy2;
268
269         ATOMIC(volatile int commandoffset);
270
271         volatile bool waiting;
272         volatile bool starving;
273         void *waitcond;
274         void *drawcond;
275         void *drawmutex;
276
// number of used entries in spans[] / triangles[]
277         int numspans;
278         int numtriangles;
279         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
281 }
282 DPSOFTRAST_State_Thread);
283
// Global rasterizer state: framebuffer description, vertex array setup,
// the texture table, worker threads and the shared command pool.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
285 {
// framebuffer dimensions in pixels and its pixel storage
286         int fb_width;
287         int fb_height;
288         unsigned int *fb_depthpixels;
289         unsigned int *fb_colorpixels[4];
290
291         int viewport[4];
292         ALIGN(float fb_viewportcenter[4]);
293         ALIGN(float fb_viewportscale[4]);
294
295         float color[4];
296         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
298
// application supplied vertex arrays
299         const float *pointer_vertex3f;
300         const float *pointer_color4f;
301         const unsigned char *pointer_color4ub;
302         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         int stride_vertex;
304         int stride_color;
305         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// point into texture[]; rebased by DPSOFTRAST_Texture_Grow when that array moves
307         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
308
309         int firstvertex;
310         int numvertices;
311         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312         float *screencoord4f;
313         int drawstarty;
314         int drawendy;
315         int drawclipped;
316         
317         int shader_mode;
318         int shader_permutation;
319         int shader_exactspecularmath;
320
// texture table: texture_max is the allocated slot count, texture_end is
// one past the highest slot in use, texture_firstfree is where
// DPSOFTRAST_Texture_New starts searching for an empty slot
321         int texture_max;
322         int texture_end;
323         int texture_firstfree;
324         DPSOFTRAST_Texture *texture;
325
326         int bigendian;
327
328         // error reporting (static message set by the function that failed)
329         const char *errorstring;
330
331         bool usethreads;
// when nonzero each thread gets two separate bands of rows (see DPSOFTRAST_RecalcThread)
332         int interlace;
// number of entries in threads[]
333         int numthreads;
334         DPSOFTRAST_State_Thread *threads;
335
336         ATOMIC(volatile int drawcommand);
337
338         DPSOFTRAST_State_Command_Pool commandpool;
339 }
340 DPSOFTRAST_State);
341
// the single rasterizer instance used by every function in this file
342 DPSOFTRAST_State dpsoftrast;
343
// depth buffer scale: 1024 * 1048576 = 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four floats (expected range 0..1) into one 0xAARRGGBB value, rounding
// each channel to the nearest of 0..255. Arguments are parenthesized so that
// expressions can be passed, and the shifts are done on unsigned values so
// that an alpha of 255 does not overflow a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// convert a float depth d to the integer depth buffer value DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
349
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
351 {
352         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354         fb_viewportcenter[3] = 0.5f;
355         fb_viewportcenter[0] = 0.0f;
356         fb_viewportscale[1] = 0.5f * viewport[2];
357         fb_viewportscale[2] = -0.5f * viewport[3];
358         fb_viewportscale[3] = 0.5f;
359         fb_viewportscale[0] = 1.0f;
360 }
361
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
363 {
364         if (dpsoftrast.interlace)
365         {
366                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370         }
371         else
372         {
373                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375         }
376 }
377
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
379 {
380         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381         // and viewport projection values
382         int x1, x2;
383         int y1, y2;
384         x1 = thread->scissor[0];
385         x2 = thread->scissor[0] + thread->scissor[2];
386         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387         y2 = dpsoftrast.fb_height - thread->scissor[1];
388         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
389         if (x1 < 0) x1 = 0;
390         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
391         if (y1 < 0) y1 = 0;
392         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393         thread->fb_scissor[0] = x1;
394         thread->fb_scissor[1] = y1;
395         thread->fb_scissor[2] = x2 - x1;
396         thread->fb_scissor[3] = y2 - y1;
397
398         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399         DPSOFTRAST_RecalcThread(thread);
400 }
401
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
403 {
404         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
405 }
406
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
408 {
409         if (thread->blendsubtract)
410         {
411                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
412                 {
413                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
417                 }
418         }
419         else
420         {       
421                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
422                 {
423                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
428                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
434                 }
435         }
436 }
437
// Cheap inline check before calling DPSOFTRAST_Validate: only makes the call
// when at least one of the flags in f is still pending. Always evaluates to 0.
// 'thread' is parenthesized so that any pointer expression can be passed; it
// may be evaluated twice.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
439
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
441 {
442         mask &= thread->validate;
443         if (!mask)
444                 return;
445         if (mask & DPSOFTRAST_VALIDATE_FB)
446         {
447                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448                 DPSOFTRAST_RecalcFB(thread);
449         }
450         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
451         {
452                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453                 DPSOFTRAST_RecalcDepthFunc(thread);
454         }
455         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
456         {
457                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458                 DPSOFTRAST_RecalcBlendFunc(thread);
459         }
460 }
461
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
463 {
464         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465                 return &dpsoftrast.texture[index];
466         return NULL;
467 }
468
469 static void DPSOFTRAST_Texture_Grow(void)
470 {
471         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472         DPSOFTRAST_State_Thread *thread;
473         int i;
474         int j;
475         DPSOFTRAST_Flush();
476         // expand texture array as needed
477         if (dpsoftrast.texture_max < 1024)
478                 dpsoftrast.texture_max = 1024;
479         else
480                 dpsoftrast.texture_max *= 2;
481         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
482         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483                 if (dpsoftrast.texbound[i])
484                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
485         for (j = 0; j < dpsoftrast.numthreads; j++)
486         {
487                 thread = &dpsoftrast.threads[j];
488                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489                         if (thread->texbound[i])
490                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
491         }
492 }
493
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
495 {
496         int w;
497         int h;
498         int d;
499         int size;
500         int s;
501         int texnum;
502         int mipmaps;
503         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505         DPSOFTRAST_Texture *texture;
506         if (width*height*depth < 1)
507         {
508                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
509                 return 0;
510         }
511         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
512         {
513                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
514                 return 0;
515         }
516         switch(texformat)
517         {
518         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
521                 break;
522         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
524                 {
525                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526                         return 0;
527                 }
528                 if (depth != 1)
529                 {
530                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
531                         return 0;
532                 }
533                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
534                 {
535                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
536                         return 0;
537                 }
538                 break;
539         }
540         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
541         {
542                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
543                 return 0;
544         }
545         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
546         {
547                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
548                 return 0;
549         }
550         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
551         {
552                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
553                 return 0;
554         }
555         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
556         {
557                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
558                 return 0;
559         }
560         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
561         {
562                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
563                 return 0;
564         }
565         // find first empty slot in texture array
566         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567                 if (!dpsoftrast.texture[texnum].bytes)
568                         break;
569         dpsoftrast.texture_firstfree = texnum + 1;
570         if (dpsoftrast.texture_max <= texnum)
571                 DPSOFTRAST_Texture_Grow();
572         if (dpsoftrast.texture_end <= texnum)
573                 dpsoftrast.texture_end = texnum + 1;
574         texture = &dpsoftrast.texture[texnum];
575         memset(texture, 0, sizeof(*texture));
576         texture->flags = flags;
577         texture->width = width;
578         texture->height = height;
579         texture->depth = depth;
580         texture->sides = sides;
581         texture->binds = 0;
582         w = width;
583         h = height;
584         d = depth;
585         size = 0;
586         mipmaps = 0;
587         w = width;
588         h = height;
589         d = depth;
590         for (;;)
591         {
592                 s = w * h * d * sides * 4;
593                 texture->mipmap[mipmaps][0] = size;
594                 texture->mipmap[mipmaps][1] = s;
595                 texture->mipmap[mipmaps][2] = w;
596                 texture->mipmap[mipmaps][3] = h;
597                 texture->mipmap[mipmaps][4] = d;
598                 size += s;
599                 mipmaps++;
600                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
601                         break;
602                 if (w > 1) w >>= 1;
603                 if (h > 1) h >>= 1;
604                 if (d > 1) d >>= 1;
605         }
606         texture->mipmaps = mipmaps;
607         texture->size = size;
608
609         // allocate the pixels now
610         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
611
612         return texnum;
613 }
614 void DPSOFTRAST_Texture_Free(int index)
615 {
616         DPSOFTRAST_Texture *texture;
617         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
618         if (texture->binds)
619                 DPSOFTRAST_Flush();
620         if (texture->bytes)
621                 MM_FREE(texture->bytes);
622         texture->bytes = NULL;
623         memset(texture, 0, sizeof(*texture));
624         // adjust the free range and used range
625         if (dpsoftrast.texture_firstfree > index)
626                 dpsoftrast.texture_firstfree = index;
627         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628                 dpsoftrast.texture_end--;
629 }
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
631 {
632         int i, x, y, z, w, layer0, layer1, row0, row1;
633         unsigned char *o, *i0, *i1, *i2, *i3;
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->mipmaps <= 1)
637                 return;
638         for (i = 1;i < texture->mipmaps;i++)
639         {
640                 for (z = 0;z < texture->mipmap[i][4];z++)
641                 {
642                         layer0 = z*2;
643                         layer1 = z*2+1;
644                         if (layer1 >= texture->mipmap[i-1][4])
645                                 layer1 = texture->mipmap[i-1][4]-1;
646                         for (y = 0;y < texture->mipmap[i][3];y++)
647                         {
648                                 row0 = y*2;
649                                 row1 = y*2+1;
650                                 if (row1 >= texture->mipmap[i-1][3])
651                                         row1 = texture->mipmap[i-1][3]-1;
652                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
653                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657                                 w = texture->mipmap[i][2];
658                                 if (layer1 > layer0)
659                                 {
660                                         if (texture->mipmap[i-1][2] > 1)
661                                         {
662                                                 // average 3D texture
663                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
664                                                 {
665                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669                                                 }
670                                         }
671                                         else
672                                         {
673                                                 // average 3D mipmap with parent width == 1
674                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
675                                                 {
676                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
680                                                 }
681                                         }
682                                 }
683                                 else
684                                 {
685                                         if (texture->mipmap[i-1][2] > 1)
686                                         {
687                                                 // average 2D texture (common case)
688                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
689                                                 {
690                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694                                                 }
695                                         }
696                                         else
697                                         {
698                                                 // 2D texture with parent width == 1
699                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
700                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
701                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
702                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
703                                         }
704                                 }
705                         }
706                 }
707         }
708 }
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
710 {
711         DPSOFTRAST_Texture *texture;
712         unsigned char *dst;
713         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
714         if (texture->binds)
715                 DPSOFTRAST_Flush();
716         if (pixels)
717         {
718                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719                 while (blockheight > 0)
720                 {
721                         memcpy(dst, pixels, blockwidth * 4);
722                         pixels += blockwidth * 4;
723                         dst += texture->mipmap[0][2] * 4;
724                         blockheight--;
725                 }
726         }
727         DPSOFTRAST_Texture_CalculateMipmaps(index);
728 }
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (texture->binds)
734                 DPSOFTRAST_Flush();
735         if (pixels)
736                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737         DPSOFTRAST_Texture_CalculateMipmaps(index);
738 }
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
740 {
741         DPSOFTRAST_Texture *texture;
742         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743         return texture->mipmap[mip][2];
744 }
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
746 {
747         DPSOFTRAST_Texture *texture;
748         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749         return texture->mipmap[mip][3];
750 }
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755         return texture->mipmap[mip][4];
756 }
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         if (texture->binds)
762                 DPSOFTRAST_Flush();
763         return texture->bytes + texture->mipmap[mip][0];
764 }
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
766 {
767         DPSOFTRAST_Texture *texture;
768         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
770         {
771                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772                 return;
773         }
774         if (texture->binds)
775                 DPSOFTRAST_Flush();
776         texture->filter = filter;
777 }
778
779 static void DPSOFTRAST_Draw_FlushThreads(void);
780
781 static void DPSOFTRAST_Draw_SyncCommands(void)
782 {
783         if(dpsoftrast.usethreads) MEMORY_BARRIER;
784         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
785 }
786
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
788 {
789         DPSOFTRAST_State_Thread *thread;
790         int i;
791         int freecommand = dpsoftrast.commandpool.freecommand;
792         int usedcommands = dpsoftrast.commandpool.usedcommands;
793         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
794                 return;
795         DPSOFTRAST_Draw_SyncCommands();
796         for(;;)
797         {
798                 int waitindex = -1;
799                 int commandoffset;
800                 usedcommands = 0;
801                 for (i = 0; i < dpsoftrast.numthreads; i++)
802                 {
803                         thread = &dpsoftrast.threads[i]; 
804                         commandoffset = freecommand - thread->commandoffset;
805                         if (commandoffset < 0)
806                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
807                         if (commandoffset > usedcommands)
808                         {
809                                 waitindex = i;
810                                 usedcommands = commandoffset;
811                         }
812                 }
813                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
814                         break;
815                 thread = &dpsoftrast.threads[waitindex];
816                 Thread_LockMutex(thread->drawmutex);
817                 if (thread->commandoffset != dpsoftrast.drawcommand)
818                 {
819                         thread->waiting = true;
820                         if (thread->starving) Thread_CondSignal(thread->drawcond);
821                         Thread_CondWait(thread->waitcond, thread->drawmutex);
822                         thread->waiting = false;
823                 }
824                 Thread_UnlockMutex(thread->drawmutex);
825         }
826         dpsoftrast.commandpool.usedcommands = usedcommands;
827 }
828
// pads size up to the next multiple of COMMAND_SIZE
// (the masking presumably relies on COMMAND_SIZE being a power of two - TODO confirm)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size) & (COMMAND_SIZE - 1))) & (COMMAND_SIZE - 1)))
// allocates a DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>,
// sized to the padded size of the command struct
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_##name, DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
833
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
835 {
836         DPSOFTRAST_Command *command;
837         int freecommand = dpsoftrast.commandpool.freecommand;
838         int usedcommands = dpsoftrast.commandpool.usedcommands;
839         int extra = sizeof(DPSOFTRAST_Command);
840         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
842         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
843         {
844                 if (dpsoftrast.usethreads)
845                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
846                 else
847                         DPSOFTRAST_Draw_FlushThreads();
848                 freecommand = dpsoftrast.commandpool.freecommand;
849                 usedcommands = dpsoftrast.commandpool.usedcommands;
850         }
851         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
852         {
853                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854                 command->opcode = DPSOFTRAST_OPCODE_Reset;
855                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
856                 freecommand = 0;
857         }
858         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859         command->opcode = opcode;
860         command->commandsize = size;
861         freecommand += size;
862         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
863                 freecommand = 0;
864         dpsoftrast.commandpool.freecommand = freecommand;
865         dpsoftrast.commandpool.usedcommands = usedcommands + size;
866         return command;
867 }
868
869 static void DPSOFTRAST_UndoCommand(int size)
870 {
871         int freecommand = dpsoftrast.commandpool.freecommand;
872         int usedcommands = dpsoftrast.commandpool.usedcommands;
873         freecommand -= size;
874         if (freecommand < 0)
875                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876         usedcommands -= size;
877         dpsoftrast.commandpool.freecommand = freecommand;
878         dpsoftrast.commandpool.usedcommands = usedcommands;
879 }
880                 
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
883 {
884         thread->viewport[0] = command->x;
885         thread->viewport[1] = command->y;
886         thread->viewport[2] = command->width;
887         thread->viewport[3] = command->height;
888         thread->validate |= DPSOFTRAST_VALIDATE_FB;
889 }
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
891 {
892         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
893         command->x = x;
894         command->y = y;
895         command->width = width;
896         command->height = height;
897
898         dpsoftrast.viewport[0] = x;
899         dpsoftrast.viewport[1] = y;
900         dpsoftrast.viewport[2] = width;
901         dpsoftrast.viewport[3] = height;
902         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
903 }
904
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
907 {
908         int i, x1, y1, x2, y2, w, h, x, y;
909         int miny1, maxy1, miny2, maxy2;
910         int bandy;
911         unsigned int *p;
912         unsigned int c;
913         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914         miny1 = thread->miny1;
915         maxy1 = thread->maxy1;
916         miny2 = thread->miny2;
917         maxy2 = thread->maxy2;
918         x1 = thread->fb_scissor[0];
919         y1 = thread->fb_scissor[1];
920         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
922         if (y1 < miny1) y1 = miny1;
923         if (y2 > maxy2) y2 = maxy2;
924         w = x2 - x1;
925         h = y2 - y1;
926         if (w < 1 || h < 1)
927                 return;
928         // FIXME: honor fb_colormask?
929         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
930         for (i = 0;i < 4;i++)
931         {
932                 if (!dpsoftrast.fb_colorpixels[i])
933                         continue;
934                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
935                 for (;y < bandy;y++)
936                 {
937                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938                         for (x = x1;x < x2;x++)
939                                 p[x] = c;
940                 }
941         }
942 }
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
944 {
945         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
946         command->r = r;
947         command->g = g;
948         command->b = b;
949         command->a = a;
950 }
951
952 DEFCOMMAND(3, ClearDepth, float depth;)
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
954 {
955         int x1, y1, x2, y2, w, h, x, y;
956         int miny1, maxy1, miny2, maxy2;
957         int bandy;
958         unsigned int *p;
959         unsigned int c;
960         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961         miny1 = thread->miny1;
962         maxy1 = thread->maxy1;
963         miny2 = thread->miny2;
964         maxy2 = thread->maxy2;
965         x1 = thread->fb_scissor[0];
966         y1 = thread->fb_scissor[1];
967         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
969         if (y1 < miny1) y1 = miny1;
970         if (y2 > maxy2) y2 = maxy2;
971         w = x2 - x1;
972         h = y2 - y1;
973         if (w < 1 || h < 1)
974                 return;
975         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
976         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977         for (;y < bandy;y++)
978         {
979                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980                 for (x = x1;x < x2;x++)
981                         p[x] = c;
982         }
983 }
984 void DPSOFTRAST_ClearDepth(float d)
985 {
986         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
987         command->depth = d;
988 }
989
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
992 {
993         thread->colormask[0] = command->r != 0;
994         thread->colormask[1] = command->g != 0;
995         thread->colormask[2] = command->b != 0;
996         thread->colormask[3] = command->a != 0;
997         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
998 }
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1000 {
1001         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1002         command->r = r;
1003         command->g = g;
1004         command->b = b;
1005         command->a = a;
1006 }
1007
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1010 {
1011         thread->depthtest = command->enable;
1012         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1013 }
1014 void DPSOFTRAST_DepthTest(int enable)
1015 {
1016         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017         command->enable = enable;
1018 }
1019
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1022 {
1023         thread->scissortest = command->enable;
1024         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1025 }
1026 void DPSOFTRAST_ScissorTest(int enable)
1027 {
1028         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029         command->enable = enable;
1030 }
1031
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1034 {
1035         thread->scissor[0] = command->x;
1036         thread->scissor[1] = command->y;
1037         thread->scissor[2] = command->width;
1038         thread->scissor[3] = command->height;
1039         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1040 }
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1042 {
1043         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1044         command->x = x;
1045         command->y = y;
1046         command->width = width;
1047         command->height = height;
1048 }
1049
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1052 {
1053         thread->blendfunc[0] = command->sfactor;
1054         thread->blendfunc[1] = command->dfactor;
1055         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1056 }
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1058 {
1059         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060         command->sfactor = sfactor;
1061         command->dfactor = dfactor;
1062 }
1063
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1066 {
1067         thread->blendsubtract = command->enable;
1068         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1069 }
1070 void DPSOFTRAST_BlendSubtract(int enable)
1071 {
1072         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073         command->enable = enable;
1074 }
1075
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1078 {
1079         thread->depthmask = command->enable;
1080 }
1081 void DPSOFTRAST_DepthMask(int enable)
1082 {
1083         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084         command->enable = enable;
1085 }
1086
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1089 {
1090         thread->depthfunc = command->func;
1091 }
1092 void DPSOFTRAST_DepthFunc(int func)
1093 {
1094         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095         command->func = func;
1096 }
1097
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1100 {
1101         thread->depthrange[0] = command->nearval;
1102         thread->depthrange[1] = command->farval;
1103 }
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1105 {
1106         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107         command->nearval = nearval;
1108         command->farval = farval;
1109 }
1110
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1113 {
1114         thread->polygonoffset[0] = command->alongnormal;
1115         thread->polygonoffset[1] = command->intoview;
1116 }
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1118 {
1119         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120         command->alongnormal = alongnormal;
1121         command->intoview = intoview;
1122 }
1123
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1126 {
1127         thread->cullface = command->mode;
1128 }
1129 void DPSOFTRAST_CullFace(int mode)
1130 {
1131         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132         command->mode = mode;
1133 }
1134
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1137 {
1138         thread->alphatest = command->enable;
1139 }
1140 void DPSOFTRAST_AlphaTest(int enable)
1141 {
1142         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143         command->enable = enable;
1144 }
1145
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1148 {
1149         thread->alphafunc = command->func;
1150         thread->alphavalue = command->ref;
1151 }
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1153 {
1154         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155         command->func = func;
1156         command->ref = ref;
1157 }
1158
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1160 {
1161         dpsoftrast.color[0] = r;
1162         dpsoftrast.color[1] = g;
1163         dpsoftrast.color[2] = b;
1164         dpsoftrast.color[3] = a;
1165 }
1166
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1168 {
1169         int outstride = blockwidth * 4;
1170         int instride = dpsoftrast.fb_width * 4;
1171         int bx1 = blockx;
1172         int by1 = blocky;
1173         int bx2 = blockx + blockwidth;
1174         int by2 = blocky + blockheight;
1175         int bw;
1176         int x;
1177         int y;
1178         unsigned char *inpixels;
1179         unsigned char *b;
1180         unsigned char *o;
1181         DPSOFTRAST_Flush();
1182         if (bx1 < 0) bx1 = 0;
1183         if (by1 < 0) by1 = 0;
1184         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1186         bw = bx2 - bx1;
1187         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1188         if (dpsoftrast.bigendian)
1189         {
1190                 for (y = by1;y < by2;y++)
1191                 {
1192                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1194                         for (x = bx1;x < bx2;x++)
1195                         {
1196                                 o[0] = b[3];
1197                                 o[1] = b[2];
1198                                 o[2] = b[1];
1199                                 o[3] = b[0];
1200                                 o += 4;
1201                                 b += 4;
1202                         }
1203                 }
1204         }
1205         else
1206         {
1207                 for (y = by1;y < by2;y++)
1208                 {
1209                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1211                         memcpy(o, b, bw*4);
1212                 }
1213         }
1214
1215 }
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1217 {
1218         int tx1 = tx;
1219         int ty1 = ty;
1220         int tx2 = tx + width;
1221         int ty2 = ty + height;
1222         int sx1 = sx;
1223         int sy1 = sy;
1224         int sx2 = sx + width;
1225         int sy2 = sy + height;
1226         int swidth;
1227         int sheight;
1228         int twidth;
1229         int theight;
1230         int sw;
1231         int sh;
1232         int tw;
1233         int th;
1234         int y;
1235         unsigned int *spixels;
1236         unsigned int *tpixels;
1237         DPSOFTRAST_Texture *texture;
1238         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239         if (mip < 0 || mip >= texture->mipmaps) return;
1240         DPSOFTRAST_Flush();
1241         spixels = dpsoftrast.fb_colorpixels[0];
1242         swidth = dpsoftrast.fb_width;
1243         sheight = dpsoftrast.fb_height;
1244         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245         twidth = texture->mipmap[mip][2];
1246         theight = texture->mipmap[mip][3];
1247         if (tx1 < 0) tx1 = 0;
1248         if (ty1 < 0) ty1 = 0;
1249         if (tx2 > twidth) tx2 = twidth;
1250         if (ty2 > theight) ty2 = theight;
1251         if (sx1 < 0) sx1 = 0;
1252         if (sy1 < 0) sy1 = 0;
1253         if (sx2 > swidth) sx2 = swidth;
1254         if (sy2 > sheight) sy2 = sheight;
1255         tw = tx2 - tx1;
1256         th = ty2 - ty1;
1257         sw = sx2 - sx1;
1258         sh = sy2 - sy1;
1259         if (tw > sw) tw = sw;
1260         if (th > sh) th = sh;
1261         if (tw < 1 || th < 1)
1262                 return;
1263         sy1 = sheight - 1 - sy1;
1264         for (y = 0;y < th;y++)
1265                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1266         if (texture->mipmaps > 1)
1267                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1268 }
1269
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1272 {
1273         if (thread->texbound[command->unitnum])
1274                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275         thread->texbound[command->unitnum] = command->texture;
1276 }
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1278 {
1279         DPSOFTRAST_Command_SetTexture *command;
1280         DPSOFTRAST_Texture *texture;
1281         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1282         {
1283                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1284                 return;
1285         }
1286         texture = DPSOFTRAST_Texture_GetByIndex(index);
1287         if (index && !texture)
1288         {
1289                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1290                 return;
1291         }
1292
1293         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294         command->unitnum = unitnum;
1295         command->texture = texture;
1296
1297         dpsoftrast.texbound[unitnum] = texture;
1298         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1299 }
1300
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1302 {
1303         dpsoftrast.pointer_vertex3f = vertex3f;
1304         dpsoftrast.stride_vertex = stride;
1305 }
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1307 {
1308         dpsoftrast.pointer_color4f = color4f;
1309         dpsoftrast.pointer_color4ub = NULL;
1310         dpsoftrast.stride_color = stride;
1311 }
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1313 {
1314         dpsoftrast.pointer_color4f = NULL;
1315         dpsoftrast.pointer_color4ub = color4ub;
1316         dpsoftrast.stride_color = stride;
1317 }
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1319 {
1320         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322         dpsoftrast.stride_texcoord[unitnum] = stride;
1323 }
1324
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1327 {
1328         thread->shader_mode = command->mode;
1329         thread->shader_permutation = command->permutation;
1330         thread->shader_exactspecularmath = command->exactspecularmath;
1331 }
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1333 {
1334         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335         command->mode = mode;
1336         command->permutation = permutation;
1337         command->exactspecularmath = exactspecularmath;
1338
1339         dpsoftrast.shader_mode = mode;
1340         dpsoftrast.shader_permutation = permutation;
1341         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1342 }
1343
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1346 {
1347         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1348 }
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1350 {
1351         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352         command->index = index;
1353         command->val[0] = v0;
1354         command->val[1] = v1;
1355         command->val[2] = v2;
1356         command->val[3] = v3;
1357
1358         dpsoftrast.uniform4f[index*4+0] = v0;
1359         dpsoftrast.uniform4f[index*4+1] = v1;
1360         dpsoftrast.uniform4f[index*4+2] = v2;
1361         dpsoftrast.uniform4f[index*4+3] = v3;
1362 }
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1364 {
1365         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366         command->index = index;
1367         memcpy(command->val, v, sizeof(command->val));
1368
1369         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1370 }
1371
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1374 {
1375         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1376 }
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1378 {
1379 #ifdef SSE_POSSIBLE
1380         int i, index;
1381         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1382         {
1383                 __m128 m0, m1, m2, m3;
1384                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385                 command->index = (DPSOFTRAST_UNIFORM)index;
1386                 if (((size_t)v)&(ALIGN_SIZE-1))
1387                 {
1388                         m0 = _mm_loadu_ps(v);
1389                         m1 = _mm_loadu_ps(v+4);
1390                         m2 = _mm_loadu_ps(v+8);
1391                         m3 = _mm_loadu_ps(v+12);
1392                 }
1393                 else
1394                 {
1395                         m0 = _mm_load_ps(v);
1396                         m1 = _mm_load_ps(v+4);
1397                         m2 = _mm_load_ps(v+8);
1398                         m3 = _mm_load_ps(v+12);
1399                 }
1400                 if (transpose)
1401                 {
1402                         __m128 t0, t1, t2, t3;
1403                         t0 = _mm_unpacklo_ps(m0, m1);
1404                         t1 = _mm_unpacklo_ps(m2, m3);
1405                         t2 = _mm_unpackhi_ps(m0, m1);
1406                         t3 = _mm_unpackhi_ps(m2, m3);
1407                         m0 = _mm_movelh_ps(t0, t1);
1408                         m1 = _mm_movehl_ps(t1, t0);
1409                         m2 = _mm_movelh_ps(t2, t3);
1410                         m3 = _mm_movehl_ps(t3, t2);                     
1411                 }
1412                 _mm_store_ps(command->val, m0);
1413                 _mm_store_ps(command->val+4, m1);
1414                 _mm_store_ps(command->val+8, m2);
1415                 _mm_store_ps(command->val+12, m3);
1416                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1420         }
1421 #endif
1422 }
1423
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1426 {
1427         thread->uniform1i[command->index] = command->val;
1428 }
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1430 {
1431         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432         command->index = index;
1433         command->val = i0;
1434
1435         dpsoftrast.uniform1i[command->index] = i0;
1436 }
1437
1438 #ifdef SSE_POSSIBLE
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1440 {
1441         float *end = dst + size*4;
1442         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1443         {
1444                 while (dst < end)
1445                 {
1446                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1447                         dst += 4;
1448                         src += stride;
1449                 }
1450         }
1451         else
1452         {
1453                 while (dst < end)
1454                 {
1455                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1456                         dst += 4;
1457                         src += stride;
1458                 }
1459         }
1460 }
1461
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1463 {
1464         float *end = dst + size*4;
1465         if (stride == sizeof(float[3]))
1466         {
1467                 float *end4 = dst + (size&~3)*4;        
1468                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1469                 {
1470                         while (dst < end4)
1471                         {
1472                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1473                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dst += 16;
1486                                 src += 4*sizeof(float[3]);
1487                         }
1488                 }
1489                 else
1490                 {
1491                         while (dst < end4)
1492                         {
1493                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506                                 dst += 16;
1507                                 src += 4*sizeof(float[3]);
1508                         }
1509                 }
1510         }
1511         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1512         {
1513                 while (dst < end)
1514                 {
1515                         __m128 v = _mm_loadu_ps((const float *)src);
1516                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519                         _mm_store_ps(dst, v);
1520                         dst += 4;
1521                         src += stride;
1522                 }
1523         }
1524         else
1525         {
1526                 while (dst < end)
1527                 {
1528                         __m128 v = _mm_load_ps((const float *)src);
1529                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532                         _mm_store_ps(dst, v);
1533                         dst += 4;
1534                         src += stride;
1535                 }
1536         }
1537 }
1538
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1540 {
1541         float *end = dst + size*4;
1542         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1543         if (stride == sizeof(float[2]))
1544         {
1545                 float *end2 = dst + (size&~1)*4;
1546                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1547                 {
1548                         while (dst < end2)
1549                         {
1550                                 __m128 v = _mm_loadu_ps((const float *)src);
1551                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553                                 dst += 8;
1554                                 src += 2*sizeof(float[2]);
1555                         }
1556                 }
1557                 else
1558                 {
1559                         while (dst < end2)
1560                         {
1561                                 __m128 v = _mm_load_ps((const float *)src);
1562                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1564                                 dst += 8;
1565                                 src += 2*sizeof(float[2]);
1566                         }
1567                 }
1568         }
1569         while (dst < end)
1570         {
1571                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1572                 dst += 4;
1573                 src += stride;
1574         }
1575 }
1576
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 {
1579         float *end = dst + size*4;
1580         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1581         if (stride == sizeof(unsigned char[4]))
1582         {
1583                 float *end4 = dst + (size&~3)*4;
1584                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585                 {
1586                         while (dst < end4)
1587                         {
1588                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1593                                 dst += 16;
1594                                 src += 4*sizeof(unsigned char[4]);
1595                         }
1596                 }
1597                 else
1598                 {
1599                         while (dst < end4)
1600                         {
1601                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1606                                 dst += 16;
1607                                 src += 4*sizeof(unsigned char[4]);
1608                         }
1609                 }
1610         }
1611         while (dst < end)
1612         {
1613                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1615                 dst += 4;
1616                 src += stride;
1617         }
1618 }
1619
// writes the 4 floats at src to each of the `size` consecutive 4-float slots
// starting at dst
// src is read once with an unaligned load, dst is written with aligned stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + 4*i, value);
}
1630 #endif
1631
// transforms `numitems` 4-float vectors: out = in.x*m[0..3] + in.y*m[4..7] +
// in.z*m[8..11] + in.w*m[12..15], where m is inmatrix16f
// out4f may be the same buffer as in4f (in-place transform)
// an identity matrix is detected and turned into a plain copy
// neither in4f nor out4f needs to be 16 byte aligned; aligned loads/stores are
// used only when both are
// without SSE_POSSIBLE this function does nothing
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3;
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
	{
		// fast case for identity matrix
		if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	end = out4f + numitems*4;
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	// check alignment of both buffers: an aligned store to an unaligned out4f
	// faults, which an in-place call on an unaligned buffer would trigger
	if ((((size_t)in4f)|((size_t)out4f))&(ALIGN_SIZE-1))
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			_mm_storeu_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
#endif
}
1679
// copies `numitems` 4-float vectors from in4f to out4f
// does nothing when numitems is not positive or when out4f and in4f are the
// same buffer (memcpy must not be given identical source and destination)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
1684
1685 #ifdef SSE_POSSIBLE
// perspective divide plus viewport mapping of one vertex
// in is (x, y, z, w); it is rearranged to (1, x, y, z), multiplied lane by lane
// with viewportscale, divided by w, offset by viewportcenter and rotated back,
// so that with c = viewportcenter and s = viewportscale
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// expands to a braced block declaring locals named p and w, so the arguments
// must not be expressions that use variables with those names
1686 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1687 { \
1688         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1692 }
1693
// NOTE(review): the expansion is token for token the same as
// DPSOFTRAST_PROJECTVERTEX - all four lanes are projected, not only y
// with in = (x, y, z, w), c = viewportcenter and s = viewportscale
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// declares locals named p and w inside a braced block
1694 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1695 { \
1696         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1697         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1698         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1699         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1700 }
1701
// multiplies one vertex by a matrix given as four __m128 values:
//   out = in.x*m0 + in.y*m1 + in.z*m2 + in.w*m3
// each component of in is broadcast to all lanes before the multiply
// declares a local named p inside a braced block, so the arguments must not
// be expressions that use a variable with that name
1702 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1703 { \
1704         __m128 p = (in); \
1705         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1706                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1707                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1708                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1709 }
1710
// computes the projected y extent of an axis-aligned bounding box
// minposf/maxposf: the two extreme corners of the box (4 floats each, read with aligned loads)
// inmatrix16f: 16 floats, the transform applied to each of the 8 box corners
// starty/endy: receive the truncated extent after the viewport mapping (endy has 1 added)
// returns a bitmask in which bit k is set when corner k has transformed z+w < 0
// also leaves the macros BBFRONT and BBCLIP defined for the rest of the file
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1712 {
1713         int clipmask = 0xFF;
1714         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1715         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap the first two lanes of every matrix row so that the transformed y lands
// in lane 0, which is the lane the scalar (_ss) operations below work on
1718         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transforms corner k into bb[k] and stores z+w in clipdist[k];
// when z+w >= 0 it clears bit k of clipmask and folds the corner's y/w into
// minproj/maxproj
1722         #define BBFRONT(k, pos) \
1723         { \
1724                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1727                 { \
1728                         __m128 proj; \
1729                         clipmask &= ~(1<<k); \
1730                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731                         minproj = _mm_min_ss(minproj, proj); \
1732                         maxproj = _mm_max_ss(maxproj, proj); \
1733                 } \
1734         }
// the 8 corners: bit 0 of the corner index selects the max x, bit 1 the max y,
// bit 2 the max z and w (unset bits select the min values)
1735         BBFRONT(0, minpos); 
1736         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1737         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1738         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1739         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1740         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1741         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1742         BBFRONT(7, maxpos);
// BBCLIP(k): for a corner still flagged in clipmask, looks at the three corners
// whose index differs in one bit (k^1, k^2, k^4); for each of those that is not
// flagged, interpolates along the connecting edge to the point where z+w is 0
// and folds that point's y/w into minproj/maxproj
1743         #define BBCLIP(k) \
1744         { \
1745                 if (clipmask&(1<<k)) \
1746                 { \
1747                         if (!(clipmask&(1<<(k^1)))) \
1748                         { \
1749                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752                                 minproj = _mm_min_ss(minproj, proj); \
1753                                 maxproj = _mm_max_ss(maxproj, proj); \
1754                         } \
1755                         if (!(clipmask&(1<<(k^2)))) \
1756                         { \
1757                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760                                 minproj = _mm_min_ss(minproj, proj); \
1761                                 maxproj = _mm_max_ss(maxproj, proj); \
1762                         } \
1763                         if (!(clipmask&(1<<(k^4)))) \
1764                         { \
1765                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768                                 minproj = _mm_min_ss(minproj, proj); \
1769                                 maxproj = _mm_max_ss(maxproj, proj); \
1770                         } \
1771                 } \
1772         }
1773         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element [2] of the viewport center/scale into lane 0; that is the
// element DPSOFTRAST_PROJECTVERTEX pairs with the y component
1774         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected extent to [-2, 2] before the viewport mapping
1776         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// NOTE(review): maxproj feeds starty and minproj feeds endy - presumably the
// y viewport scale is negative so that larger projected y maps to smaller
// values; confirm against where fb_viewportscale is set
1780         *starty = _mm_cvttss_si32(maxproj);
1781         *endy = _mm_cvttss_si32(minproj)+1;
1782         return clipmask;
1783 }
1784         
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1786 {
1787         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788         float *end = out4f + numitems*4;
1789         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790         __m128 minpos, maxpos;
1791         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1792         {
1793                 minpos = maxpos = _mm_loadu_ps(in4f);
1794                 while (out4f < end)
1795                 {
1796                         __m128 v = _mm_loadu_ps(in4f);
1797                         minpos = _mm_min_ps(minpos, v);
1798                         maxpos = _mm_max_ps(maxpos, v);
1799                         _mm_store_ps(out4f, v);
1800                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801                         _mm_store_ps(screen4f, v);
1802                         in4f += 4;
1803                         out4f += 4;
1804                         screen4f += 4;
1805                 }
1806         }
1807         else
1808         {
1809                 minpos = maxpos = _mm_load_ps(in4f);
1810                 while (out4f < end)
1811                 {
1812                         __m128 v = _mm_load_ps(in4f);
1813                         minpos = _mm_min_ps(minpos, v);
1814                         maxpos = _mm_max_ps(maxpos, v);
1815                         _mm_store_ps(out4f, v);
1816                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817                         _mm_store_ps(screen4f, v);
1818                         in4f += 4;
1819                         out4f += 4;
1820                         screen4f += 4;
1821                 }
1822         }
1823         if (starty && endy) 
1824         {
1825                 ALIGN(float minposf[4]);
1826                 ALIGN(float maxposf[4]);
1827                 _mm_store_ps(minposf, minpos);
1828                 _mm_store_ps(maxposf, maxpos);
1829                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1830         }
1831         return 0;
1832 }
1833
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1835 {
1836         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1838         float *end;
1839         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841         end = out4f + numitems*4;
1842         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844         m0 = _mm_loadu_ps(inmatrix16f);
1845         m1 = _mm_loadu_ps(inmatrix16f + 4);
1846         m2 = _mm_loadu_ps(inmatrix16f + 8);
1847         m3 = _mm_loadu_ps(inmatrix16f + 12);
1848         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1849         {
1850                 minpos = maxpos = _mm_loadu_ps(in4f);
1851                 while (out4f < end)
1852                 {
1853                         __m128 v = _mm_loadu_ps(in4f);
1854                         minpos = _mm_min_ps(minpos, v);
1855                         maxpos = _mm_max_ps(maxpos, v);
1856                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857                         _mm_store_ps(out4f, v);
1858                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859                         _mm_store_ps(screen4f, v);
1860                         in4f += 4;
1861                         out4f += 4;
1862                         screen4f += 4;
1863                 }
1864         }
1865         else
1866         {
1867                 minpos = maxpos = _mm_load_ps(in4f);
1868                 while (out4f < end)
1869                 {
1870                         __m128 v = _mm_load_ps(in4f);
1871                         minpos = _mm_min_ps(minpos, v);
1872                         maxpos = _mm_max_ps(maxpos, v);
1873                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874                         _mm_store_ps(out4f, v);
1875                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876                         _mm_store_ps(screen4f, v);
1877                         in4f += 4;
1878                         out4f += 4;
1879                         screen4f += 4;
1880                 }
1881         }
1882         if (starty && endy) 
1883         {
1884                 ALIGN(float minposf[4]);
1885                 ALIGN(float maxposf[4]);
1886                 _mm_store_ps(minposf, minpos);
1887                 _mm_store_ps(maxposf, maxpos);
1888                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1889         }
1890         return 0;
1891 }
1892 #endif
1893
// expands the vertex attribute `inarray` for the current vertex range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into 4-float vectors in
// dpsoftrast.post_array4f[outarray] and returns that buffer
// position is read as 3 floats; color as 4 floats, else 4 bytes, else filled
// from dpsoftrast.color; any other value is treated as a texcoord array with
// 2, 3 or 4 float components (the buffer is left untouched when the texcoord
// pointer is not set or the component count is something else)
// returns NULL without SSE_POSSIBLE
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *base;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		base = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, base, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			base = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, base, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			base = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, base, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch (dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, base, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, base, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, base, count, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1952
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1954 {
1955         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1957         return data;
1958 }
1959
1960 #if 0
// projects a vertex attribute array into dpsoftrast.screencoord4f, storing the
// result of DPSOFTRAST_Vertex_Project in dpsoftrast.drawclipped and the y
// range in dpsoftrast.drawstarty/drawendy; returns the attribute buffer
// inarray >= 0: the attribute is first loaded into post_array4f[outarray]
// inarray < 0: the current contents of post_array4f[outarray] are used
// returns NULL without SSE_POSSIBLE
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1971 #endif
1972
// transforms a vertex attribute array in place by inmatrix16f and projects it
// into dpsoftrast.screencoord4f, storing the result of
// DPSOFTRAST_Vertex_TransformProject in dpsoftrast.drawclipped and the y range
// in dpsoftrast.drawstarty/drawendy; returns the attribute buffer
// inarray >= 0: the attribute is first loaded into post_array4f[outarray]
// inarray < 0: the current contents of post_array4f[outarray] are used
// returns NULL without SSE_POSSIBLE
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1983
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1985 {
1986         int x;
1987         int startx = span->startx;
1988         int endx = span->endx;
1989         float wslope = triangle->w[0];
1990         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1991         float endz = 1.0f / (w + wslope * startx);
1992         if (triangle->w[0] == 0)
1993         {
1994                 // LordHavoc: fast flat polygons (HUD/menu)
1995                 for (x = startx;x < endx;x++)
1996                         zf[x] = endz;
1997                 return;
1998         }
1999         for (x = startx;x < endx;)
2000         {
2001                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2002                 float z = endz, dz;
2003                 if (nextsub >= endx) nextsub = endsub = endx-1;
2004                 endz = 1.0f / (w + wslope * nextsub);
2005                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2006                 for (; x <= endsub; x++, z += dz)
2007                         zf[x] = z;
2008         }
2009 }
2010
2011 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2012 {
2013         int x;
2014         int startx = span->startx;
2015         int endx = span->endx;
2016         int d[4];
2017         float a, b;
2018         unsigned char * RESTRICT pixelmask = span->pixelmask;
2019         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2020         if (!pixel)
2021                 return;
2022         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2023         // handle alphatest now (this affects depth writes too)
2024         if (thread->alphatest)
2025                 for (x = startx;x < endx;x++)
2026                         if (in4f[x*4+3] < 0.5f)
2027                                 pixelmask[x] = false;
2028         // FIXME: this does not handle bigendian
2029         switch(thread->fb_blendmode)
2030         {
2031         case DPSOFTRAST_BLENDMODE_OPAQUE:
2032                 for (x = startx;x < endx;x++)
2033                 {
2034                         if (!pixelmask[x])
2035                                 continue;
2036                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2037                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2038                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2039                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2040                         pixel[x*4+0] = d[0];
2041                         pixel[x*4+1] = d[1];
2042                         pixel[x*4+2] = d[2];
2043                         pixel[x*4+3] = d[3];
2044                 }
2045                 break;
2046         case DPSOFTRAST_BLENDMODE_ALPHA:
2047                 for (x = startx;x < endx;x++)
2048                 {
2049                         if (!pixelmask[x])
2050                                 continue;
2051                         a = in4f[x*4+3] * 255.0f;
2052                         b = 1.0f - in4f[x*4+3];
2053                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         a = in4f[x*4+3] * 255.0f;
2069                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2070                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2071                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2072                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2073                         pixel[x*4+0] = d[0];
2074                         pixel[x*4+1] = d[1];
2075                         pixel[x*4+2] = d[2];
2076                         pixel[x*4+3] = d[3];
2077                 }
2078                 break;
2079         case DPSOFTRAST_BLENDMODE_ADD:
2080                 for (x = startx;x < endx;x++)
2081                 {
2082                         if (!pixelmask[x])
2083                                 continue;
2084                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2085                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2086                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2087                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2088                         pixel[x*4+0] = d[0];
2089                         pixel[x*4+1] = d[1];
2090                         pixel[x*4+2] = d[2];
2091                         pixel[x*4+3] = d[3];
2092                 }
2093                 break;
2094         case DPSOFTRAST_BLENDMODE_INVMOD:
2095                 for (x = startx;x < endx;x++)
2096                 {
2097                         if (!pixelmask[x])
2098                                 continue;
2099                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2100                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2101                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2102                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2103                         pixel[x*4+0] = d[0];
2104                         pixel[x*4+1] = d[1];
2105                         pixel[x*4+2] = d[2];
2106                         pixel[x*4+3] = d[3];
2107                 }
2108                 break;
2109         case DPSOFTRAST_BLENDMODE_MUL:
2110                 for (x = startx;x < endx;x++)
2111                 {
2112                         if (!pixelmask[x])
2113                                 continue;
2114                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2115                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2116                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2117                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2118                         pixel[x*4+0] = d[0];
2119                         pixel[x*4+1] = d[1];
2120                         pixel[x*4+2] = d[2];
2121                         pixel[x*4+3] = d[3];
2122                 }
2123                 break;
2124         case DPSOFTRAST_BLENDMODE_MUL2:
2125                 for (x = startx;x < endx;x++)
2126                 {
2127                         if (!pixelmask[x])
2128                                 continue;
2129                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2130                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2131                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2132                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2133                         pixel[x*4+0] = d[0];
2134                         pixel[x*4+1] = d[1];
2135                         pixel[x*4+2] = d[2];
2136                         pixel[x*4+3] = d[3];
2137                 }
2138                 break;
2139         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2140                 for (x = startx;x < endx;x++)
2141                 {
2142                         if (!pixelmask[x])
2143                                 continue;
2144                         a = in4f[x*4+3] * -255.0f;
2145                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2146                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2147                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2148                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2149                         pixel[x*4+0] = d[0];
2150                         pixel[x*4+1] = d[1];
2151                         pixel[x*4+2] = d[2];
2152                         pixel[x*4+3] = d[3];
2153                 }
2154                 break;
2155         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2156                 for (x = startx;x < endx;x++)
2157                 {
2158                         if (!pixelmask[x])
2159                                 continue;
2160                         a = 255.0f;
2161                         b = 1.0f - in4f[x*4+3];
2162                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2163                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2164                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2165                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2166                         pixel[x*4+0] = d[0];
2167                         pixel[x*4+1] = d[1];
2168                         pixel[x*4+2] = d[2];
2169                         pixel[x*4+3] = d[3];
2170                 }
2171                 break;
2172         case DPSOFTRAST_BLENDMODE_INVADD:
2173                 for (x = startx;x < endx;x++)
2174                 {
2175                         if (!pixelmask[x])
2176                                 continue;
2177                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2178                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2179                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2180                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2181                         pixel[x*4+0] = d[0];
2182                         pixel[x*4+1] = d[1];
2183                         pixel[x*4+2] = d[2];
2184                         pixel[x*4+3] = d[3];
2185                 }
2186                 break;
2187         }
2188 }
2189
2190 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2191 {
2192 #ifdef SSE_POSSIBLE
2193         int x;
2194         int startx = span->startx;
2195         int endx = span->endx;
2196         int subx;
2197         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2198         unsigned char * RESTRICT pixelmask = span->pixelmask;
2199         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2200         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2201         if (!pixel)
2202                 return;
2203         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2204         pixeli += span->y * dpsoftrast.fb_width + span->x;
2205         // handle alphatest now (this affects depth writes too)
2206         if (thread->alphatest)
2207                 for (x = startx;x < endx;x++)
2208                         if (in4ub[x*4+3] < 128)
2209                                 pixelmask[x] = false;
2210         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2211         // helps sprites, text and hud artwork
2212         switch(thread->fb_blendmode)
2213         {
2214         case DPSOFTRAST_BLENDMODE_ALPHA:
2215         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2216         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2217                 for (x = startx;x < endx;x++)
2218                         if (in4ub[x*4+3] < 1)
2219                                 pixelmask[x] = false;
2220                 break;
2221         case DPSOFTRAST_BLENDMODE_OPAQUE:
2222         case DPSOFTRAST_BLENDMODE_ADD:
2223         case DPSOFTRAST_BLENDMODE_INVMOD:
2224         case DPSOFTRAST_BLENDMODE_MUL:
2225         case DPSOFTRAST_BLENDMODE_MUL2:
2226         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2227         case DPSOFTRAST_BLENDMODE_INVADD:
2228                 break;
2229         }
2230         // put some special values at the end of the mask to ensure the loops end
2231         pixelmask[endx] = 1;
2232         pixelmask[endx+1] = 0;
2233         // LordHavoc: use a double loop to identify subspans, this helps the
2234         // optimized copy/blend loops to perform at their best, most triangles
2235         // have only one run of pixels, and do the search using wide reads...
2236         x = startx;
2237         while (x < endx)
2238         {
2239                 // if this pixel is masked off, it's probably not alone...
2240                 if (!pixelmask[x])
2241                 {
2242                         x++;
2243 #if 1
2244                         if (x + 8 < endx)
2245                         {
2246                                 // the 4-item search must be aligned or else it stalls badly
2247                                 if ((x & 3) && !pixelmask[x]) x++;
2248                                 if ((x & 3) && !pixelmask[x]) x++;
2249                                 if ((x & 3) && !pixelmask[x]) x++;
2250                                 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2251                                         x += 4;
2252                         }
2253 #endif
2254                         for (;!pixelmask[x];x++)
2255                                 ;
2256                         // rather than continue the loop, just check the end variable
2257                         if (x >= endx)
2258                                 break;
2259                 }
2260                 // find length of subspan
2261                 subx = x + 1;
2262 #if 1
2263                 if (x + 8 < endx)
2264                 {
2265                         if ((subx & 3) && pixelmask[subx]) subx++;
2266                         if ((subx & 3) && pixelmask[subx]) subx++;
2267                         if ((subx & 3) && pixelmask[subx]) subx++;
2268                         while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2269                                 subx += 4;
2270                 }
2271 #endif
2272                 for (;pixelmask[subx];subx++)
2273                         ;
2274                 // the checks can overshoot, so make sure to clip it...
2275                 if (subx > endx)
2276                         subx = endx;
2277                 // now that we know the subspan length...  process!
2278                 switch(thread->fb_blendmode)
2279                 {
2280                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2281 #if 0
2282                         if (subx - x >= 16)
2283                         {
2284                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2285                                 x = subx;
2286                         }
2287                         else
2288 #elif 1
2289                         while (x + 16 <= subx)
2290                         {
2291                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2292                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2293                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2294                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2295                                 x += 16;
2296                         }
2297 #endif
2298                         {
2299                                 while (x + 4 <= subx)
2300                                 {
2301                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2302                                         x += 4;
2303                                 }
2304                                 if (x + 2 <= subx)
2305                                 {
2306                                         pixeli[x] = ini[x];
2307                                         pixeli[x+1] = ini[x+1];
2308                                         x += 2;
2309                                 }
2310                                 if (x < subx)
2311                                 {
2312                                         pixeli[x] = ini[x];
2313                                         x++;
2314                                 }
2315                         }
2316                         break;
2317                 case DPSOFTRAST_BLENDMODE_ALPHA:
2318                 #define FINISHBLEND(blend2, blend1) \
2319                         for (;x + 1 < subx;x += 2) \
2320                         { \
2321                                 __m128i src, dst; \
2322                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2323                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2324                                 blend2; \
2325                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2326                         } \
2327                         if (x < subx) \
2328                         { \
2329                                 __m128i src, dst; \
2330                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2331                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2332                                 blend1; \
2333                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2334                                 x++; \
2335                         }
2336                         FINISHBLEND({
2337                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2338                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2339                         }, {
2340                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2341                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2342                         });
2343                         break;
2344                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2345                         FINISHBLEND({
2346                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2347                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2348                         }, {
2349                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2350                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2351                         });
2352                         break;
2353                 case DPSOFTRAST_BLENDMODE_ADD:
2354                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2355                         break;
2356                 case DPSOFTRAST_BLENDMODE_INVMOD:
2357                         FINISHBLEND({
2358                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2359                         }, {
2360                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2361                         });
2362                         break;
2363                 case DPSOFTRAST_BLENDMODE_MUL:
2364                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2365                         break;
2366                 case DPSOFTRAST_BLENDMODE_MUL2:
2367                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2368                         break;
2369                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2370                         FINISHBLEND({
2371                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2372                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2373                         }, {
2374                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2375                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2376                         });
2377                         break;
2378                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2379                         FINISHBLEND({
2380                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2381                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2382                         }, {
2383                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2384                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2385                         });
2386                         break;
2387                 case DPSOFTRAST_BLENDMODE_INVADD:
2388                         FINISHBLEND({
2389                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2390                         }, {
2391                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2392                         });
2393                         break;
2394                 }
2395         }
2396 #endif
2397 }
2398
2399 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2400 {
2401         int x;
2402         int startx = span->startx;
2403         int endx = span->endx;
2404         int flags;
2405         float c[4];
2406         float data[4];
2407         float slope[4];
2408         float tc[2], endtc[2];
2409         float tcscale[2];
2410         unsigned int tci[2];
2411         unsigned int tci1[2];
2412         unsigned int tcimin[2];
2413         unsigned int tcimax[2];
2414         int tciwrapmask[2];
2415         int tciwidth;
2416         int filter;
2417         int mip;
2418         const unsigned char * RESTRICT pixelbase;
2419         const unsigned char * RESTRICT pixel[4];
2420         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2421         // if no texture is bound, just fill it with white
2422         if (!texture)
2423         {
2424                 for (x = startx;x < endx;x++)
2425                 {
2426                         out4f[x*4+0] = 1.0f;
2427                         out4f[x*4+1] = 1.0f;
2428                         out4f[x*4+2] = 1.0f;
2429                         out4f[x*4+3] = 1.0f;
2430                 }
2431                 return;
2432         }
2433         mip = triangle->mip[texunitindex];
2434         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2435         // if this mipmap of the texture is 1 pixel, just fill it with that color
2436         if (texture->mipmap[mip][1] == 4)
2437         {
2438                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2439                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2440                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2441                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2442                 for (x = startx;x < endx;x++)
2443                 {
2444                         out4f[x*4+0] = c[0];
2445                         out4f[x*4+1] = c[1];
2446                         out4f[x*4+2] = c[2];
2447                         out4f[x*4+3] = c[3];
2448                 }
2449                 return;
2450         }
2451         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2452         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2453         flags = texture->flags;
2454         tcscale[0] = texture->mipmap[mip][2];
2455         tcscale[1] = texture->mipmap[mip][3];
2456         tciwidth = texture->mipmap[mip][2];
2457         tcimin[0] = 0;
2458         tcimin[1] = 0;
2459         tcimax[0] = texture->mipmap[mip][2]-1;
2460         tcimax[1] = texture->mipmap[mip][3]-1;
2461         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2462         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2463         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2464         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2465         if (filter)
2466         {
2467                 endtc[0] -= 0.5f;
2468                 endtc[1] -= 0.5f;
2469         }
2470         for (x = startx;x < endx;)
2471         {
2472                 unsigned int subtc[2];
2473                 unsigned int substep[2];
2474                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2475                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2476                 if (nextsub >= endx)
2477                 {
2478                         nextsub = endsub = endx-1;      
2479                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2480                 }
2481                 tc[0] = endtc[0];
2482                 tc[1] = endtc[1];
2483                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2484                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2485                 if (filter)
2486                 {
2487                         endtc[0] -= 0.5f;
2488                         endtc[1] -= 0.5f;
2489                 }
2490                 substep[0] = (endtc[0] - tc[0]) * subscale;
2491                 substep[1] = (endtc[1] - tc[1]) * subscale;
2492                 subtc[0] = tc[0] * (1<<12);
2493                 subtc[1] = tc[1] * (1<<12);
2494                 if (filter)
2495                 {
2496                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2497                         {
2498                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2499                                 {
2500                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2501                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2502                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2503                                         tci[0] = subtc[0]>>12;
2504                                         tci[1] = subtc[1]>>12;
2505                                         tci1[0] = tci[0] + 1;
2506                                         tci1[1] = tci[1] + 1;
2507                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2508                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2509                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2510                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2511                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2512                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2513                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2514                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2515                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2516                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2517                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2518                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2519                                         out4f[x*4+0] = c[0];
2520                                         out4f[x*4+1] = c[1];
2521                                         out4f[x*4+2] = c[2];
2522                                         out4f[x*4+3] = c[3];
2523                                 }
2524                         }
2525                         else
2526                         {
2527                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2528                                 {
2529                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2530                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2531                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2532                                         tci[0] = subtc[0]>>12;
2533                                         tci[1] = subtc[1]>>12;
2534                                         tci1[0] = tci[0] + 1;
2535                                         tci1[1] = tci[1] + 1;
2536                                         tci[0] &= tciwrapmask[0];
2537                                         tci[1] &= tciwrapmask[1];
2538                                         tci1[0] &= tciwrapmask[0];
2539                                         tci1[1] &= tciwrapmask[1];
2540                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2541                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2542                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2543                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2544                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2545                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2546                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2547                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2548                                         out4f[x*4+0] = c[0];
2549                                         out4f[x*4+1] = c[1];
2550                                         out4f[x*4+2] = c[2];
2551                                         out4f[x*4+3] = c[3];
2552                                 }
2553                         }
2554                 }
2555                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2556                 {
2557                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2558                         {
2559                                 tci[0] = subtc[0]>>12;
2560                                 tci[1] = subtc[1]>>12;
2561                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2562                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2563                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2564                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2565                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2566                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2567                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2568                                 out4f[x*4+0] = c[0];
2569                                 out4f[x*4+1] = c[1];
2570                                 out4f[x*4+2] = c[2];
2571                                 out4f[x*4+3] = c[3];
2572                         }
2573                 }
2574                 else
2575                 {
2576                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2577                         {
2578                                 tci[0] = subtc[0]>>12;
2579                                 tci[1] = subtc[1]>>12;
2580                                 tci[0] &= tciwrapmask[0];
2581                                 tci[1] &= tciwrapmask[1];
2582                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2583                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2584                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2585                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2586                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2587                                 out4f[x*4+0] = c[0];
2588                                 out4f[x*4+1] = c[1];
2589                                 out4f[x*4+2] = c[2];
2590                                 out4f[x*4+3] = c[3];
2591                         }
2592                 }
2593         }
2594 }
2595
2596 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2597 {
2598 #ifdef SSE_POSSIBLE
2599         int x;
2600         int startx = span->startx;
2601         int endx = span->endx;
2602         int flags;
2603         __m128 data, slope, tcscale;
2604         __m128i tcsize, tcmask, tcoffset, tcmax;
2605         __m128 tc, endtc;
2606         __m128i subtc, substep, endsubtc;
2607         int filter;
2608         int mip;
2609         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2610         const unsigned char * RESTRICT pixelbase;
2611         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2612         // if no texture is bound, just fill it with white
2613         if (!texture)
2614         {
2615                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2616                 return;
2617         }
2618         mip = triangle->mip[texunitindex];
2619         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2620         // if this mipmap of the texture is 1 pixel, just fill it with that color
2621         if (texture->mipmap[mip][1] == 4)
2622         {
2623                 unsigned int k = *((const unsigned int *)pixelbase);
2624                 for (x = startx;x < endx;x++)
2625                         outi[x] = k;
2626                 return;
2627         }
2628         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2629         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2630         flags = texture->flags;
2631         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2632         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2633         tcscale = _mm_cvtepi32_ps(tcsize);
2634         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2635         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2636         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2637         if (filter)
2638                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2639         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2640         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2641         tcmax = _mm_packs_epi32(tcmask, tcmask);
2642         for (x = startx;x < endx;)
2643         {
2644                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2645                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2646                 if (nextsub >= endx)
2647                 {
2648                         nextsub = endsub = endx-1;
2649                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2650                 }       
2651                 tc = endtc;
2652                 subtc = endsubtc;
2653                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2654                 if (filter)
2655                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2656                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2657                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2658                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2659                 substep = _mm_slli_epi32(substep, 1);
2660                 if (filter)
2661                 {
2662                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2663                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2664                         {
2665                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2666                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2667                                 {
2668                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2669                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2670                                         tci = _mm_madd_epi16(tci, tcoffset);
2671                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2672                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2673                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2674                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2675                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2676                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2677                                         fracm = _mm_srli_epi16(subtc, 1);
2678                                         pix1 = _mm_add_epi16(pix1,
2679                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2680                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2681                                         pix3 = _mm_add_epi16(pix3,
2682                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2683                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2684                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2685                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2686                                         pix2 = _mm_add_epi16(pix2,
2687                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2688                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2689                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2690                                 }
2691                                 if (x <= endsub)
2692                                 {
2693                                         const unsigned char * RESTRICT ptr1;
2694                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2695                                         tci = _mm_madd_epi16(tci, tcoffset);
2696                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2697                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2698                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2699                                         fracm = _mm_srli_epi16(subtc, 1);
2700                                         pix1 = _mm_add_epi16(pix1,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2704                                         pix1 = _mm_add_epi16(pix1,
2705                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2706                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2707                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2708                                         x++;
2709                                 }
2710                         }
2711                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2712                         {
2713                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2714                                 {
2715                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2716                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2717                                         tci = _mm_madd_epi16(tci, tcoffset);
2718                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2719                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2720                                                                                         _mm_setzero_si128());
2721                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2722                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2723                                                                                         _mm_setzero_si128());
2724                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2725                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2726                                         tci = _mm_madd_epi16(tci, tcoffset);
2727                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2728                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2729                                                                                         _mm_setzero_si128());
2730                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2731                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2732                                                                                         _mm_setzero_si128());
2733                                         fracm = _mm_srli_epi16(subtc, 1);
2734                                         pix1 = _mm_add_epi16(pix1,
2735                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2736                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2737                                         pix3 = _mm_add_epi16(pix3,
2738                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2739                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2740                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2741                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2742                                         pix2 = _mm_add_epi16(pix2,
2743                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2744                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2745                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2746                                 }
2747                                 if (x <= endsub)
2748                                 {
2749                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2750                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2751                                         tci = _mm_madd_epi16(tci, tcoffset);
2752                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2753                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2754                                                                                         _mm_setzero_si128());
2755                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2756                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2757                                                                                         _mm_setzero_si128());
2758                                         fracm = _mm_srli_epi16(subtc, 1);
2759                                         pix1 = _mm_add_epi16(pix1,
2760                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2761                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2762                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2763                                         pix1 = _mm_add_epi16(pix1,
2764                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2765                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2766                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2767                                         x++;
2768                                 }
2769                         }
2770                         else
2771                         {
2772                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2773                                 {
2774                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2775                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2776                                         tci = _mm_madd_epi16(tci, tcoffset);
2777                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2778                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2779                                                                                         _mm_setzero_si128());
2780                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2781                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2782                                                                                         _mm_setzero_si128());
2783                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2784                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2785                                         tci = _mm_madd_epi16(tci, tcoffset);
2786                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2787                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2788                                                                                         _mm_setzero_si128());
2789                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2790                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2791                                                                                         _mm_setzero_si128());
2792                                         fracm = _mm_srli_epi16(subtc, 1);
2793                                         pix1 = _mm_add_epi16(pix1,
2794                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2795                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2796                                         pix3 = _mm_add_epi16(pix3,
2797                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2798                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2799                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2800                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2801                                         pix2 = _mm_add_epi16(pix2,
2802                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2803                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2804                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2805                                 }
2806                                 if (x <= endsub)
2807                                 {
2808                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2809                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2810                                         tci = _mm_madd_epi16(tci, tcoffset);
2811                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2812                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2813                                                                                         _mm_setzero_si128());
2814                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2815                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2816                                                                                         _mm_setzero_si128());
2817                                         fracm = _mm_srli_epi16(subtc, 1);
2818                                         pix1 = _mm_add_epi16(pix1,
2819                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2820                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2821                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2822                                         pix1 = _mm_add_epi16(pix1,
2823                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2824                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2825                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2826                                         x++;
2827                                 }
2828                         }
2829                 }
2830                 else
2831                 {
2832                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2833                         {
2834                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2835                                 {
2836                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2837                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2838                                         tci = _mm_madd_epi16(tci, tcoffset);
2839                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2840                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2841                                 }
2842                                 if (x <= endsub)
2843                                 {
2844                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2845                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2846                                         tci = _mm_madd_epi16(tci, tcoffset);
2847                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2848                                         x++;
2849                                 }
2850                         }
2851                         else
2852                         {
2853                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2854                                 {
2855                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2856                                         tci = _mm_and_si128(tci, tcmax); 
2857                                         tci = _mm_madd_epi16(tci, tcoffset);
2858                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2859                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2860                                 }
2861                                 if (x <= endsub)
2862                                 {
2863                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2864                                         tci = _mm_and_si128(tci, tcmax); 
2865                                         tci = _mm_madd_epi16(tci, tcoffset);
2866                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2867                                         x++;
2868                                 }
2869                         }
2870                 }
2871         }
2872 #endif
2873 }
2874
2875 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2876 {
2877         // TODO: IMPLEMENT
2878         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2879 }
2880
// Placeholder shadowmap lookup: the argument is ignored and the constant 1.0f
// is returned for every input (presumably meaning "not shadowed").
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2886
2887 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2888 {
2889         int x;
2890         int startx = span->startx;
2891         int endx = span->endx;
2892         float c[4];
2893         float data[4];
2894         float slope[4];
2895         float z;
2896         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2897         for (x = startx;x < endx;x++)
2898         {
2899                 z = zf[x];
2900                 c[0] = (data[0] + slope[0]*x) * z;
2901                 c[1] = (data[1] + slope[1]*x) * z;
2902                 c[2] = (data[2] + slope[2]*x) * z;
2903                 c[3] = (data[3] + slope[3]*x) * z;
2904                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2905                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2906                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2907                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2908         }
2909 }
2910
2911 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2912 {
2913         int x;
2914         int startx = span->startx;
2915         int endx = span->endx;
2916         float c[4];
2917         float data[4];
2918         float slope[4];
2919         float z;
2920         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2921         for (x = startx;x < endx;x++)
2922         {
2923                 z = zf[x];
2924                 c[0] = (data[0] + slope[0]*x) * z;
2925                 c[1] = (data[1] + slope[1]*x) * z;
2926                 c[2] = (data[2] + slope[2]*x) * z;
2927                 c[3] = (data[3] + slope[3]*x) * z;
2928                 out4f[x*4+0] = c[0];
2929                 out4f[x*4+1] = c[1];
2930                 out4f[x*4+2] = c[2];
2931                 out4f[x*4+3] = c[3];
2932         }
2933 }
2934
2935 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2936 {
2937         int x, startx = span->startx, endx = span->endx;
2938         float c[4], localcolor[4];
2939         localcolor[0] = subcolor[0];
2940         localcolor[1] = subcolor[1];
2941         localcolor[2] = subcolor[2];
2942         localcolor[3] = subcolor[3];
2943         for (x = startx;x < endx;x++)
2944         {
2945                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2946                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2947                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2948                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2949                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2950                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2951                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2952                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2953         }
2954 }
2955
2956 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2957 {
2958         int x, startx = span->startx, endx = span->endx;
2959         for (x = startx;x < endx;x++)
2960         {
2961                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2962                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2963                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2964                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2965         }
2966 }
2967
2968 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2969 {
2970         int x, startx = span->startx, endx = span->endx;
2971         for (x = startx;x < endx;x++)
2972         {
2973                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2974                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2975                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2976                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2977         }
2978 }
2979
2980 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2981 {
2982         int x, startx = span->startx, endx = span->endx;
2983         float a, b;
2984         for (x = startx;x < endx;x++)
2985         {
2986                 a = 1.0f - inb4f[x*4+3];
2987                 b = inb4f[x*4+3];
2988                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2989                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2990                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2991                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2992         }
2993 }
2994
2995 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2996 {
2997         int x, startx = span->startx, endx = span->endx;
2998         float localcolor[4], ilerp, lerp;
2999         localcolor[0] = color[0];
3000         localcolor[1] = color[1];
3001         localcolor[2] = color[2];
3002         localcolor[3] = color[3];
3003         ilerp = 1.0f - localcolor[3];
3004         lerp = localcolor[3];
3005         for (x = startx;x < endx;x++)
3006         {
3007                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3008                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3009                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3010                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
3011         }
3012 }
3013
3014
3015
3016 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3017 {
3018 #ifdef SSE_POSSIBLE
3019         int x;
3020         int startx = span->startx;
3021         int endx = span->endx;
3022         __m128 data, slope;
3023         __m128 mod, endmod;
3024         __m128i submod, substep, endsubmod;
3025         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3026         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3027         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3028         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3029         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3030         for (x = startx; x < endx;)
3031         {
3032                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3033                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3034                 if (nextsub >= endx)
3035                 {
3036                         nextsub = endsub = endx-1;
3037                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3038                 }
3039                 mod = endmod;
3040                 submod = endsubmod;
3041                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3042                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3043                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3044                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3045                 substep = _mm_packs_epi32(substep, substep);
3046                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3047                 {
3048                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3049                         pix = _mm_mulhi_epu16(pix, submod);
3050                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3051                 }
3052                 if (x <= endsub)
3053                 {
3054                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3055                         pix = _mm_mulhi_epu16(pix, submod);
3056                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3057                         x++;
3058                 }
3059         }
3060 #endif
3061 }
3062
3063 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3064 {
3065 #ifdef SSE_POSSIBLE
3066         int x;
3067         int startx = span->startx;
3068         int endx = span->endx;
3069         __m128 data, slope;
3070         __m128 mod, endmod;
3071         __m128i submod, substep, endsubmod;
3072         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3073         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3074         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3075         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3076         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3077         for (x = startx; x < endx;)
3078         {
3079                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3080                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3081                 if (nextsub >= endx)
3082                 {
3083                         nextsub = endsub = endx-1;
3084                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3085                 }
3086                 mod = endmod;
3087                 submod = endsubmod;
3088                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3089                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3090                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3091                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3092                 substep = _mm_packs_epi32(substep, substep);
3093                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3094                 {
3095                         __m128i pix = _mm_srai_epi16(submod, 4);
3096                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3097                 }
3098                 if (x <= endsub)
3099                 {
3100                         __m128i pix = _mm_srai_epi16(submod, 4);
3101                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3102                         x++;
3103                 }
3104         }
3105 #endif
3106 }
3107
3108 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3109 {
3110 #ifdef SSE_POSSIBLE
3111         int x, startx = span->startx, endx = span->endx;
3112         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3113         localcolor = _mm_packs_epi32(localcolor, localcolor);
3114         for (x = startx;x+2 <= endx;x+=2)
3115         {
3116                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3117                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3118                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3119                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3120         }
3121         if (x < endx)
3122         {
3123                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3124                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3125                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3126                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3127         }
3128 #endif
3129 }
3130
3131 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3132 {
3133 #ifdef SSE_POSSIBLE
3134         int x, startx = span->startx, endx = span->endx;
3135         for (x = startx;x+2 <= endx;x+=2)
3136         {
3137                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3138                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3139                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3140                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3141         }
3142         if (x < endx)
3143         {
3144                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3145                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3146                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3147                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3148         }
3149 #endif
3150 }
3151
3152 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3153 {
3154 #ifdef SSE_POSSIBLE
3155         int x, startx = span->startx, endx = span->endx;
3156         for (x = startx;x+2 <= endx;x+=2)
3157         {
3158                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3159                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3160                 pix1 = _mm_add_epi16(pix1, pix2);
3161                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3162         }
3163         if (x < endx)
3164         {
3165                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3166                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3167                 pix1 = _mm_add_epi16(pix1, pix2);
3168                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3169         }
3170 #endif
3171 }
3172
3173 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3174 {
3175 #ifdef SSE_POSSIBLE
3176         int x, startx = span->startx, endx = span->endx;
3177         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3178         tint = _mm_packs_epi32(tint, tint);
3179         for (x = startx;x+2 <= endx;x+=2)
3180         {
3181                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3182                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3183                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3184                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3185         }
3186         if (x < endx)
3187         {
3188                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3189                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3190                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3191                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3192         }
3193 #endif
3194 }
3195
3196 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3197 {
3198 #ifdef SSE_POSSIBLE
3199         int x, startx = span->startx, endx = span->endx;
3200         for (x = startx;x+2 <= endx;x+=2)
3201         {
3202                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3203                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3204                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3205                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3206                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3207         }
3208         if (x < endx)
3209         {
3210                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3211                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3212                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3213                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3214                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3215         }
3216 #endif
3217 }
3218
3219 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3220 {
3221 #ifdef SSE_POSSIBLE
3222         int x, startx = span->startx, endx = span->endx;
3223         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3224         localcolor = _mm_packs_epi32(localcolor, localcolor);
3225         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3226         for (x = startx;x+2 <= endx;x+=2)
3227         {
3228                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3229                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3230                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3231         }
3232         if (x < endx)
3233         {
3234                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3235                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3236                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3237         }
3238 #endif
3239 }
3240
3241
3242
3243 void DPSOFTRAST_VertexShader_Generic(void)
3244 {
3245         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3246         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3247         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3248         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3249                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3250 }
3251
3252 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3253 {
3254         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3255         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3256         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3257         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3258         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3259         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3260         {
3261                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3262                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3263                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3264                 {
3265                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3266                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3267                         {
3268                                 // multiply
3269                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3270                         }
3271                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3272                         {
3273                                 // add
3274                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3275                         }
3276                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3277                         {
3278                                 // alphablend
3279                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3280                         }
3281                 }
3282         }
3283         else
3284                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3285         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3286 }
3287
3288
3289
3290 void DPSOFTRAST_VertexShader_PostProcess(void)
3291 {
3292         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3293         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3294         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3295 }
3296
3297 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3298 {
3299         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3300         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3301         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3302         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3303         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3304         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3305         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3306         {
3307                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3308                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3309         }
3310         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3311         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3312         {
3313                 // TODO: implement saturation
3314         }
3315         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3316         {
3317                 // TODO: implement gammaramps
3318         }
3319         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3320 }
3321
3322
3323
3324 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3325 {
3326         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3327 }
3328
3329 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3330 {
3331         // this is never called (because colormask is off when this shader is used)
3332         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3333         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3334         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3335         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3336         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3337 }
3338
3339
3340
3341 void DPSOFTRAST_VertexShader_FlatColor(void)
3342 {
3343         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3344         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3345 }
3346
3347 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3348 {
3349 #ifdef SSE_POSSIBLE
3350         unsigned char * RESTRICT pixelmask = span->pixelmask;
3351         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3352         int x, startx = span->startx, endx = span->endx;
3353         __m128i Color_Ambientm;
3354         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3358         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3359         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3360                 pixel = buffer_FragColorbgra8;
3361         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3362         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3363         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3364         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3365         for (x = startx;x < endx;x++)
3366         {
3367                 __m128i color, pix;
3368                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3369                 {
3370                         __m128i pix2;
3371                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3372                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3373                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3374                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3375                         x += 3;
3376                         continue;
3377                 }
3378                 if (!pixelmask[x])
3379                         continue;
3380                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3381                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3382                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3383         }
3384         if (pixel == buffer_FragColorbgra8)
3385                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3386 #endif
3387 }
3388
3389
3390
3391 void DPSOFTRAST_VertexShader_VertexColor(void)
3392 {
3393         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3394         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3395         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3396 }
3397
3398 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3399 {
3400 #ifdef SSE_POSSIBLE
3401         unsigned char * RESTRICT pixelmask = span->pixelmask;
3402         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3403         int x, startx = span->startx, endx = span->endx;
3404         __m128i Color_Ambientm, Color_Diffusem;
3405         __m128 data, slope;
3406         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3407         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3408         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3409         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3410         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3411         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3412         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3413                 pixel = buffer_FragColorbgra8;
3414         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3415         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3416         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3417         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3418         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3419         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3420         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3421         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3422         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3423         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3424         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3425         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3426         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3427         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3428         {
3429                 __m128i color, mod, pix;
3430                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3431                 {
3432                         __m128i pix2, mod2;
3433                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3434                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3435                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3436                         data = _mm_add_ps(data, slope);
3437                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3438                         data = _mm_add_ps(data, slope);
3439                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3440                         data = _mm_add_ps(data, slope);
3441                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3442                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3443                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3444                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3445                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3446                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3447                         x += 3;
3448                         continue;
3449                 }
3450                 if (!pixelmask[x])
3451                         continue;
3452                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3453                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3454                 mod = _mm_packs_epi32(mod, mod);
3455                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3456                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3457         }
3458         if (pixel == buffer_FragColorbgra8)
3459                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3460 #endif
3461 }
3462
3463
3464
3465 void DPSOFTRAST_VertexShader_Lightmap(void)
3466 {
3467         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3468         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3469         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3470 }
3471
3472 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3473 {
3474 #ifdef SSE_POSSIBLE
3475         unsigned char * RESTRICT pixelmask = span->pixelmask;
3476         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3477         int x, startx = span->startx, endx = span->endx;
3478         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3479         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3480         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3481         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3482         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3483         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3484         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3485         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3486         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3487         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3488                 pixel = buffer_FragColorbgra8;
3489         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3490         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3491         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3492         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3493         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3494         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3495         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3496         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3497         {
3498                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3499                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3500                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3501                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3502                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3503                 for (x = startx;x < endx;x++)
3504                 {
3505                         __m128i color, lightmap, glow, pix;
3506                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3507                         {
3508                                 __m128i pix2;
3509                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3510                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3511                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3512                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3513                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3514                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3515                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3516                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3517                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3518                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3519                                 x += 3;
3520                                 continue;
3521                         }
3522                         if (!pixelmask[x])
3523                                 continue;
3524                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3525                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3526                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3527                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3528                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3529                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3530                 }
3531         }
3532         else
3533         {
3534                 for (x = startx;x < endx;x++)
3535                 {
3536                         __m128i color, lightmap, pix;
3537                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3538                         {
3539                                 __m128i pix2;
3540                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3541                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3542                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3543                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3544                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3545                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3546                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3547                                 x += 3;
3548                                 continue;
3549                         }
3550                         if (!pixelmask[x]) 
3551                                 continue;
3552                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3553                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3554                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3555                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3556                 }
3557         }
3558         if (pixel == buffer_FragColorbgra8)
3559                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3560 #endif
3561 }
3562
3563
3564 void DPSOFTRAST_VertexShader_LightDirection(void);
3565 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3566
// FakeLight vertex stage: identical to the LightDirection vertex stage.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3571
3572 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3573 {
3574         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3575 }
3576
3577
3578
3579 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3580 {
3581         DPSOFTRAST_VertexShader_LightDirection();
3582         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3583 }
3584
3585 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3586 {
3587         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3588 }
3589
3590
3591
3592 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3593 {
3594         DPSOFTRAST_VertexShader_LightDirection();
3595         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3596 }
3597
3598 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3599 {
3600         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3601 }
3602
3603
3604
3605 void DPSOFTRAST_VertexShader_LightDirection(void)
3606 {
3607         int i;
3608         int numvertices = dpsoftrast.numvertices;
3609         float LightDir[4];
3610         float LightVector[4];
3611         float EyePosition[4];
3612         float EyeVectorModelSpace[4];
3613         float EyeVector[4];
3614         float position[4];
3615         float svector[4];
3616         float tvector[4];
3617         float normal[4];
3618         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3619         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3620         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3621         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3622         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3623         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3624         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3625         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3626         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3627         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3628         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3629         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3630         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3631         for (i = 0;i < numvertices;i++)
3632         {
3633                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3634                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3635                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3636                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3637                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3638                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3639                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3640                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3641                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3642                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3643                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3644                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3645                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3646                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3647                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3648                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3649                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3650                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3651                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3652                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3653                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3654                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3655                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3656                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3657                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3658                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3659                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3660                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3661                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3662         }
3663         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3664 }
3665
// Small scalar / 3-component vector helpers.
// All of these are macros and evaluate their arguments more than once, so do not
// pass expressions with side effects (x++, function calls that mutate state, ...).

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length and length of the first three elements of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three elements of v in place to unit length; a zero-length
// vector is left untouched. v may be any array or pointer expression (it is
// parenthesized everywhere), and the temporary has a prefixed name so that an
// argument called "len" is not captured by the macro's own local.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_v3n_len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_v3n_len)\
	{\
		dpsoftrast_v3n_len = 1.0f / dpsoftrast_v3n_len;\
		(v)[0] *= dpsoftrast_v3n_len;\
		(v)[1] *= dpsoftrast_v3n_len;\
		(v)[2] *= dpsoftrast_v3n_len;\
	}\
}\
while(0)
3684
3685 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3686 {
3687         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3688         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3689         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3690         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3691         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3692         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3693         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3694         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3695         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3696         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3697         int x, startx = span->startx, endx = span->endx;
3698         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3699         float LightVectordata[4];
3700         float LightVectorslope[4];
3701         float EyeVectordata[4];
3702         float EyeVectorslope[4];
3703         float VectorSdata[4];
3704         float VectorSslope[4];
3705         float VectorTdata[4];
3706         float VectorTslope[4];
3707         float VectorRdata[4];
3708         float VectorRslope[4];
3709         float z;
3710         float diffusetex[4];
3711         float glosstex[4];
3712         float surfacenormal[4];
3713         float lightnormal[4];
3714         float lightnormal_modelspace[4];
3715         float eyenormal[4];
3716         float specularnormal[4];
3717         float diffuse;
3718         float specular;
3719         float SpecularPower;
3720         int d[4];
3721         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3722         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3723         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3724         Color_Glow[3] = 0.0f;
3725         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3726         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3727         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3728         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3729         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3730         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3731         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3732         Color_Pants[3] = 0.0f;
3733         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3734         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3735         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3736         Color_Shirt[3] = 0.0f;
3737         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3738         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3739         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3740         {
3741                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3742                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3743         }
3744         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3745         {
3746                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3747         }
3748         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3749         {
3750                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3751                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3752                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3753                 Color_Diffuse[3] = 0.0f;
3754                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3755                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3756                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3757                 LightColor[3] = 0.0f;
3758                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3759                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3760                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3761                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3762                 Color_Specular[3] = 0.0f;
3763                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3764                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3765                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3766
3767                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3768                 {
3769                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3770                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3771                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3772                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3773                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3774                 }
3775                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3776                 {
3777                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3778                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3779                 }
3780                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3781                 {
3782                         // nothing of this needed
3783                 }
3784                 else
3785                 {
3786                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3787                 }
3788
3789                 for (x = startx;x < endx;x++)
3790                 {
3791                         z = buffer_z[x];
3792                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3793                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3794                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3795                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3796                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3797                         {
3798                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3799                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3800                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3801                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3802                         }
3803                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3804                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3805                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3806                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3807                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3808                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3809                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3810                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3811
3812                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3813                         {
3814                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3815                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3816                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3817                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3818
3819                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3820                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3821                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3822                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3823
3824                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3825                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3826                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3827                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3828
3829                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3830                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3831                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3832                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3833
3834                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3835                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3836
3837                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3838                                 {
3839                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3840                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3841                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3842                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3843                                 }
3844                         }
3845                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3846                         {
3847                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3848                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3849                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3850                                 {
3851                                         float f = 1.0f / 256.0f;
3852                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3853                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3854                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3855                                 }
3856                         }
3857                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3858                         {
3859                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3860                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3861                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3862                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3863
3864                                 LightColor[0] = 1.0;
3865                                 LightColor[1] = 1.0;
3866                                 LightColor[2] = 1.0;
3867                         }
3868                         else
3869                         {
3870                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3871                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3872                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3873                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3874                         }
3875
3876                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3877
3878                         if(thread->shader_exactspecularmath)
3879                         {
3880                                 // reflect lightnormal at surfacenormal, take the negative of that
3881                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3882                                 float f;
3883                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3884                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3885                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3886                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3887
3888                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3889                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3890                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3891                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3892                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3893
3894                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3895                         }
3896                         else
3897                         {
3898                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3899                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3900                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3901                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3902
3903                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3904                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3905                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3906                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3907
3908                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3909                         }
3910
3911                         specular = pow(specular, SpecularPower * glosstex[3]);
3912                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3913                         {
3914                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3915                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3916                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3917                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3918                         }
3919                         else
3920                         {
3921                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3922                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3923                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3924                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3925                         }
3926
3927                         buffer_FragColorbgra8[x*4+0] = d[0];
3928                         buffer_FragColorbgra8[x*4+1] = d[1];
3929                         buffer_FragColorbgra8[x*4+2] = d[2];
3930                         buffer_FragColorbgra8[x*4+3] = d[3];
3931                 }
3932         }
3933         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3934         {
3935                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3936                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3937                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3938                 Color_Diffuse[3] = 0.0f;
3939                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3940                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3941                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3942                 LightColor[3] = 0.0f;
3943                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3944
3945                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3946                 {
3947                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3948                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3949                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3950                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3951                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3952                 }
3953                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3954                 {
3955                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3956                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3957                 }
3958                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3959                 {
3960                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3961                 }
3962                 else
3963                 {
3964                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3965                 }
3966
3967                 for (x = startx;x < endx;x++)
3968                 {
3969                         z = buffer_z[x];
3970                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3971                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3972                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3973                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3974                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3975                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3976                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3977                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3978
3979                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3980                         {
3981                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3982                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3983                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3984                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3985
3986                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3987                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3988                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3989                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3990
3991                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3992                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3993                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3994                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3995
3996                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3997                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3998                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3999                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4000
4001                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4002                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4003
4004                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4005                                 {
4006                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4007                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4008                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4009                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4010                                 }
4011                         }
4012                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4013                         {
4014                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4015                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4016                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4017                                 {
4018                                         float f = 1.0f / 256.0f;
4019                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4020                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4021                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4022                                 }
4023                         }
4024                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4025                         {
4026                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4027                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4028                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4029                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4030
4031                                 LightColor[0] = 1.0;
4032                                 LightColor[1] = 1.0;
4033                                 LightColor[2] = 1.0;
4034                         }
4035                         else
4036                         {
4037                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4038                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4039                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4040                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4041                         }
4042
4043                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4044                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4045                         {
4046                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4047                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4048                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4049                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4050                         }
4051                         else
4052                         {
4053                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4054                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4055                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4056                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4057                         }
4058                         buffer_FragColorbgra8[x*4+0] = d[0];
4059                         buffer_FragColorbgra8[x*4+1] = d[1];
4060                         buffer_FragColorbgra8[x*4+2] = d[2];
4061                         buffer_FragColorbgra8[x*4+3] = d[3];
4062                 }
4063         }
4064         else
4065         {
4066                 for (x = startx;x < endx;x++)
4067                 {
4068                         z = buffer_z[x];
4069                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4070                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4071                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4072                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4073
4074                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4075                         {
4076                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4077                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4078                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4079                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4080                         }
4081                         else
4082                         {
4083                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4084                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4085                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4086                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4087                         }
4088                         buffer_FragColorbgra8[x*4+0] = d[0];
4089                         buffer_FragColorbgra8[x*4+1] = d[1];
4090                         buffer_FragColorbgra8[x*4+2] = d[2];
4091                         buffer_FragColorbgra8[x*4+3] = d[3];
4092                 }
4093         }
4094         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4095 }
4096
4097
4098
4099 void DPSOFTRAST_VertexShader_LightSource(void)
4100 {
4101         int i;
4102         int numvertices = dpsoftrast.numvertices;
4103         float LightPosition[4];
4104         float LightVector[4];
4105         float LightVectorModelSpace[4];
4106         float EyePosition[4];
4107         float EyeVectorModelSpace[4];
4108         float EyeVector[4];
4109         float position[4];
4110         float svector[4];
4111         float tvector[4];
4112         float normal[4];
4113         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4114         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4115         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4116         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4117         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4118         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4119         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4120         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4121         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4122         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4123         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4124         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4125         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4126         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4127         for (i = 0;i < numvertices;i++)
4128         {
4129                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4130                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4131                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4132                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4133                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4134                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4135                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4136                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4137                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4138                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4139                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4140                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4141                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4142                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4143                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4144                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4145                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4146                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4147                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4148                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4149                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4150                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4151                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4152                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4153                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4154                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4155                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4156                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4157                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4158                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4159                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4160                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4161         }
4162         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4163         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4164 }
4165
4166 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4167 {
4168 #ifdef SSE_POSSIBLE
4169         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4170         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4171         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4172         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4173         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4174         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4175         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4176         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4177         int x, startx = span->startx, endx = span->endx;
4178         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4179         float CubeVectordata[4];
4180         float CubeVectorslope[4];
4181         float LightVectordata[4];
4182         float LightVectorslope[4];
4183         float EyeVectordata[4];
4184         float EyeVectorslope[4];
4185         float z;
4186         float diffusetex[4];
4187         float glosstex[4];
4188         float surfacenormal[4];
4189         float lightnormal[4];
4190         float eyenormal[4];
4191         float specularnormal[4];
4192         float diffuse;
4193         float specular;
4194         float SpecularPower;
4195         float CubeVector[4];
4196         float attenuation;
4197         int d[4];
4198         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4199         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4200         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4201         Color_Glow[3] = 0.0f;
4202         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4203         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4204         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4205         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4206         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4207         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4208         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4209         Color_Diffuse[3] = 0.0f;
4210         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4211         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4212         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4213         Color_Specular[3] = 0.0f;
4214         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4215         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4216         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4217         Color_Pants[3] = 0.0f;
4218         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4219         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4220         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4221         Color_Shirt[3] = 0.0f;
4222         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4223         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4224         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4225         LightColor[3] = 0.0f;
4226         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4227         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4228         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4229         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4230         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4231         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4232         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4233         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4234         {
4235                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4236                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4237         }
4238         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4239                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4240         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4241         {
4242                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4243                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4244                 for (x = startx;x < endx;x++)
4245                 {
4246                         z = buffer_z[x];
4247                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4248                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4249                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4250                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4251                         if (attenuation < 0.01f)
4252                                 continue;
4253                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4254                         {
4255                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4256                                 if (attenuation < 0.01f)
4257                                         continue;
4258                         }
4259
4260                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4261                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4262                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4263                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4264                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4265                         {
4266                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4267                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4268                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4269                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4270                         }
4271                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4272                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4273                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4274                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4275                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4276                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4277                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4278                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4279
4280                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4281                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4282                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4283                         DPSOFTRAST_Vector3Normalize(lightnormal);
4284
4285                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4286
4287                         if(thread->shader_exactspecularmath)
4288                         {
4289                                 // reflect lightnormal at surfacenormal, take the negative of that
4290                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4291                                 float f;
4292                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4293                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4294                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4295                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4296
4297                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4298                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4299                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4300                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4301                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4302
4303                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4304                         }
4305                         else
4306                         {
4307                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4308                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4309                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4310                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4311
4312                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4313                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4314                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4315                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4316
4317                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4318                         }
4319                         specular = pow(specular, SpecularPower * glosstex[3]);
4320
4321                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4322                         {
4323                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4324                                 attenuation *= (1.0f / 255.0f);
4325                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4326                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4327                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4328                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4329                         }
4330                         else
4331                         {
4332                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4333                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4334                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4335                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4336                         }
4337                         buffer_FragColorbgra8[x*4+0] = d[0];
4338                         buffer_FragColorbgra8[x*4+1] = d[1];
4339                         buffer_FragColorbgra8[x*4+2] = d[2];
4340                         buffer_FragColorbgra8[x*4+3] = d[3];
4341                 }
4342         }
4343         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4344         {
4345                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4346                 for (x = startx;x < endx;x++)
4347                 {
4348                         z = buffer_z[x];
4349                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4350                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4351                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4352                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4353                         if (attenuation < 0.01f)
4354                                 continue;
4355                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4356                         {
4357                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4358                                 if (attenuation < 0.01f)
4359                                         continue;
4360                         }
4361
4362                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4363                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4364                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4365                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4366                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4367                         {
4368                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4369                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4370                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4371                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4372                         }
4373                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4374                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4375                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4376                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4377
4378                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4379                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4380                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4381                         DPSOFTRAST_Vector3Normalize(lightnormal);
4382
4383                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4384                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4385                         {
4386                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4387                                 attenuation *= (1.0f / 255.0f);
4388                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4389                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4390                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4391                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4392                         }
4393                         else
4394                         {
4395                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4396                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4397                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4398                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4399                         }
4400                         buffer_FragColorbgra8[x*4+0] = d[0];
4401                         buffer_FragColorbgra8[x*4+1] = d[1];
4402                         buffer_FragColorbgra8[x*4+2] = d[2];
4403                         buffer_FragColorbgra8[x*4+3] = d[3];
4404                 }
4405         }
4406         else
4407         {
4408                 for (x = startx;x < endx;x++)
4409                 {
4410                         z = buffer_z[x];
4411                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4412                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4413                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4414                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4415                         if (attenuation < 0.01f)
4416                                 continue;
4417                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4418                         {
4419                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4420                                 if (attenuation < 0.01f)
4421                                         continue;
4422                         }
4423
4424                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4425                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4426                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4427                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4428                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4429                         {
4430                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4431                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4432                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4433                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4434                         }
4435                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4436                         {
4437                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4438                                 attenuation *= (1.0f / 255.0f);
4439                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4440                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4441                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4442                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4443                         }
4444                         else
4445                         {
4446                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4447                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4448                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4449                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4450                         }
4451                         buffer_FragColorbgra8[x*4+0] = d[0];
4452                         buffer_FragColorbgra8[x*4+1] = d[1];
4453                         buffer_FragColorbgra8[x*4+2] = d[2];
4454                         buffer_FragColorbgra8[x*4+3] = d[3];
4455                 }
4456         }
4457         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4458 #endif
4459 }
4460
4461
4462
4463 void DPSOFTRAST_VertexShader_Refraction(void)
4464 {
4465         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4466         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4467         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4468 }
4469
4470 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4471 {
4472         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4473
4474         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4475         float z;
4476         int x, startx = span->startx, endx = span->endx;
4477
4478         // texture reads
4479         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4480         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4481
4482         // varyings
4483         float ModelViewProjectionPositiondata[4];
4484         float ModelViewProjectionPositionslope[4];
4485
4486         // uniforms
4487         float ScreenScaleRefractReflect[2];
4488         float ScreenCenterRefractReflect[2];
4489         float DistortScaleRefractReflect[2];
4490         float RefractColor[4];
4491
4492         const unsigned char * RESTRICT pixelbase;
4493         const unsigned char * RESTRICT pixel[4];
4494         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4495         if(!texture) return;
4496         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4497
4498         // read textures
4499         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4500         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4501
4502         // read varyings
4503         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4504
4505         // read uniforms
4506         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4507         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4508         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4509         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4510         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4511         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4512         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4513         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4514         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4515         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4516
4517         // do stuff
4518         for (x = startx;x < endx;x++)
4519         {
4520                 float SafeScreenTexCoord[2];
4521                 float ScreenTexCoord[2];
4522                 float v[3];
4523                 float iw;
4524                 unsigned char c[4];
4525
4526                 z = buffer_z[x];
4527
4528                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4529                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4530         
4531                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4532                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4533                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4534
4535                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4536                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4537                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4538                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4539                 DPSOFTRAST_Vector3Normalize(v);
4540                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4541                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4542
4543                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4544                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4545                 {
4546                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4547                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4548                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4549                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4550                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4551                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4552                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4553                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4554                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4555                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4556                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4557                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4558                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4559                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4560                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4561                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4562                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4563                 }
4564                 else
4565                 {
4566                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4567                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4568                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4569                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4570                         c[0] = pixel[0][0];
4571                         c[1] = pixel[0][1];
4572                         c[2] = pixel[0][2];
4573                 }
4574
4575                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4576                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4577                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4578                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4579                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4580         }
4581
4582         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4583 }
4584
4585
4586
4587 void DPSOFTRAST_VertexShader_Water(void)
4588 {
4589         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4590 }
4591
4592
4593 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4594 {
4595         // TODO: IMPLEMENT
4596         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4597         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4598         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4599         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4600         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4601 }
4602
4603
4604
4605 void DPSOFTRAST_VertexShader_ShowDepth(void)
4606 {
4607         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4608 }
4609
4610 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4611 {
4612         // TODO: IMPLEMENT
4613         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4614         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4615         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4616         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4617         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4618 }
4619
4620
4621
4622 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4623 {
4624         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4625 }
4626
4627 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4628 {
4629         // TODO: IMPLEMENT
4630         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4631         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4632         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4633         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4634         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4635 }
4636
4637
4638
4639 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4640 {
4641         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4642 }
4643
4644 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4645 {
4646         // TODO: IMPLEMENT
4647         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4648         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4649         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4650         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4651         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4652 }
4653
4654
4655
4656 typedef struct DPSOFTRAST_ShaderModeInfo_s
4657 {
4658         int lodarrayindex;
4659         void (*Vertex)(void);
4660         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4661         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4662         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4663 }
4664 DPSOFTRAST_ShaderModeInfo;
4665
4666 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4667 {
4668         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4669         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4670         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4671         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4672         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4673         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4674         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4675         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4676         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4677         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4678         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4679         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4680         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4681         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4682         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4683         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4684 };
4685
4686 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4687 {
4688         int i;
4689         int x;
4690         int startx;
4691         int endx;
4692 //      unsigned int c;
4693 //      unsigned int *colorpixel;
4694         unsigned int *depthpixel;
4695         float w;
4696         float wslope;
4697         int depth;
4698         int depthslope;
4699         unsigned int d;
4700         DPSOFTRAST_State_Triangle *triangle;
4701         DPSOFTRAST_State_Span *span;
4702         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4703         for (i = 0; i < thread->numspans; i++)
4704         {
4705                 span = &thread->spans[i];
4706                 triangle = &thread->triangles[span->triangle];
4707                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4708                 {
4709                         wslope = triangle->w[0];
4710                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4711                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4712                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4713                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4714                         startx = span->startx;
4715                         endx = span->endx;
4716                         switch(thread->fb_depthfunc)
4717                         {
4718                         default:
4719                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4720                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4721                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4722                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4723                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4724                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4725                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4726                         }
4727                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4728                         //for (x = startx;x < endx;x++)
4729                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4730                         // if there is no color buffer, skip pixel shader
4731                         while (startx < endx && !pixelmask[startx])
4732                                 startx++;
4733                         while (endx > startx && !pixelmask[endx-1])
4734                                 endx--;
4735                         if (startx >= endx)
4736                                 continue; // no pixels to fill
4737                         span->pixelmask = pixelmask;
4738                         span->startx = startx;
4739                         span->endx = endx;
4740                         // run pixel shader if appropriate
4741                         // do this before running depthmask code, to allow the pixelshader
4742                         // to clear pixelmask values for alpha testing
4743                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4744                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4745                         if (thread->depthmask)
4746                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4747                                         if (pixelmask[x])
4748                                                 depthpixel[x] = d;
4749                 }
4750                 else
4751                 {
4752                         // no depth testing means we're just dealing with color...
4753                         // if there is no color buffer, skip pixel shader
4754                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4755                         {
4756                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4757                                 span->pixelmask = pixelmask;
4758                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4759                         }
4760                 }
4761         }
4762         thread->numspans = 0;
4763 }
4764
// Declares the Draw command and its payload fields via DEFCOMMAND.
// NOTE(review): DEFCOMMAND is defined outside this view; presumably 22 is the
// opcode number and the expansion produces DPSOFTRAST_Command_Draw with a
// common header in front of these fields (the code below reads
// command->commandsize, which is not in this field list) - confirm against
// the macro definition.
4765 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4766
// Executes one Draw command on the given thread.
// For every triangle of the command this selects the three vertex indices,
// clips against the near plane (when command->clipped is set), culls by
// facing, computes the interpolation planes for W and for each vertex array
// used by the current shader mode, picks mip levels, and then walks the
// polygon edges row by row to emit spans into thread->spans.  Spans are
// rendered by DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle
// queue fills up, and once more at the end.
// Only rows inside the thread's two bands [miny1,maxy1) and [miny2,maxy2)
// (clamped to the scissor rectangle) are processed.
// On exit (including the early-out) one reference is dropped from
// command->refcount; whoever brings it to zero frees command->arrays when the
// command is no larger than the bare command struct.
// NOTE(review): that size check presumably identifies the case where the vertex
// data was allocated separately by DPSOFTRAST_Draw_AllocateDrawCommand - confirm.
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise the
// function does nothing.
4767 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4768 {
4769 #ifdef SSE_POSSIBLE
4770         int cullface = thread->cullface;
4771         int minx, maxx, miny, maxy;
4772         int miny1, maxy1, miny2, maxy2;
4773         __m128i fbmin, fbmax;
4774         __m128 viewportcenter, viewportscale;
4775         int firstvertex = command->firstvertex;
4776         int numvertices = command->numvertices;
4777         int numtriangles = command->numtriangles;
4778         const int *element3i = command->element3i;
4779         const unsigned short *element3s = command->element3s;
4780         int clipped = command->clipped;
4781         int i;
4782         int j;
4783         int k;
4784         int y;
4785         int e[3];
4786         __m128i screeny;
4787         int starty, endy, bandy;
4788         int numpoints;
4789         int clipcase;
4790         float clipdist[4];
4791         __m128 triangleedge1, triangleedge2, trianglenormal;
4792         __m128 clipfrac[3];
4793         __m128 screen[4];
4794         DPSOFTRAST_State_Triangle *triangle;
4795         DPSOFTRAST_Texture *texture;
4796         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // vertical limits: the scissor rectangle, and the thread's two row
             // bands clamped into it
4797         miny = thread->fb_scissor[1];
4798         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4799         miny1 = bound(miny, thread->miny1, maxy);
4800         maxy1 = bound(miny, thread->maxy1, maxy);
4801         miny2 = bound(miny, thread->miny2, maxy);
4802         maxy2 = bound(miny, thread->maxy2, maxy);
             // if the command's row range [starty, endy) misses both bands there
             // is nothing to draw here; just release this thread's reference
4803         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4804         {
4805                 if (!ATOMIC_DECREMENT(command->refcount))
4806                 {
4807                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4808                                 MM_FREE(command->arrays);
4809                 }
4810                 return;
4811         }
4812         minx = thread->fb_scissor[0];
4813         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // packed 16-bit (x, y) limits repeated four times, used to clamp each
             // polygon's bounding box; they span from the top of the first band to
             // the bottom of the second, and fbmax is inclusive (hence the -1)
4814         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4815         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4816         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4817         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4818         screen[3] = _mm_setzero_ps();
4819         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4820         for (i = 0;i < numtriangles;i++)
4821         {
                     // command->arrays starts with numvertices float[4] screen
                     // coordinates; 'arrays' begins at the float[4] array that
                     // follows it and is advanced by one array per shader input
                     // further below
4822                 const float *screencoord4f = command->arrays;
4823                 const float *arrays = screencoord4f + numvertices*4;
4824
4825                 // generate the 3 edges of this triangle
4826                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // pick the three vertex indices: 16-bit index buffer, 32-bit
                     // index buffer (both made relative to firstvertex), or plain
                     // sequential vertices when there is no index buffer
4827                 if (element3s)
4828                 {
4829                         e[0] = element3s[i*3+0] - firstvertex;
4830                         e[1] = element3s[i*3+1] - firstvertex;
4831                         e[2] = element3s[i*3+2] - firstvertex;
4832                 }
4833                 else if (element3i)
4834                 {
4835                         e[0] = element3i[i*3+0] - firstvertex;
4836                         e[1] = element3i[i*3+1] - firstvertex;
4837                         e[2] = element3i[i*3+2] - firstvertex;
4838                 }
4839                 else
4840                 {
4841                         e[0] = i*3+0;
4842                         e[1] = i*3+1;
4843                         e[2] = i*3+2;
4844                 }
4845
                     // SKIPBACKFACE: builds the two edges meeting at screen[1],
                     // computes the z component of their cross product in the scalar
                     // lane of trianglenormal, and 'continue's to the next triangle
                     // when its sign matches the face being culled.  The edges and
                     // normal are reused later for the interpolation planes.
4846 #define SKIPBACKFACE \
4847                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4848                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4849                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4850                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4851                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4852                 switch(cullface) \
4853                 { \
4854                 case GL_BACK: \
4855                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4856                                 continue; \
4857                         break; \
4858                 case GL_FRONT: \
4859                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4860                                 continue; \
4861                         break; \
4862                 }
4863
                     // CLIPPEDVERTEXLERP: stores in clipfrac[p1] the fraction along
                     // the edge p1->p2 at which clipdist reaches zero, interpolates
                     // the vertex from 'arrays' at that fraction and hands it to
                     // DPSOFTRAST_PROJECTVERTEX to fill screen[k].
                     // CLIPPEDVERTEXCOPY: takes screen[k] straight from the
                     // precomputed screen coordinates of vertex p1.
4864 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4865                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4866                         { \
4867                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4868                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4869                         }
4870 #define CLIPPEDVERTEXCOPY(k,p1) \
4871                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4872
                     // GENATTRIB*: fetch (or interpolate with the clipfrac computed
                     // above) the attribute values for the first three points of the
                     // clipped polygon; the per-clipcase choices in GENATTRIBS mirror
                     // the CLIPPEDVERTEXCOPY/LERP calls of the matching clip branch.
4873 #define GENATTRIBCOPY(attrib, p1) \
4874                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4875 #define GENATTRIBLERP(attrib, p1, p2) \
4876                 { \
4877                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4878                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4879                 }
4880 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4881                 switch(clipcase) \
4882                 { \
4883                 default: \
4884                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4885                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4886                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4887                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4888                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4889                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4890                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4891                 }
4892
                     // commands flagged as unclipped skip the near plane
                     // classification and jump straight into the "all in front" case
4893                 if (! clipped)
4894                         goto notclipped;
4895
4896                 // calculate distance from nearplane
                     // (z + w of each vertex in the first array after the screen
                     // coordinates; negative means behind the plane)
4897                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4898                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4899                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices are in front; each branch builds
                     // the clipped polygon (3 or 4 points) in screen[] and records
                     // the clipcase used later by GENATTRIBS
4900                 if (clipdist[0] >= 0.0f)
4901                 {
4902                         if (clipdist[1] >= 0.0f)
4903                         {
4904                                 if (clipdist[2] >= 0.0f)
4905                                 {
4906                                 notclipped:
4907                                         // triangle is entirely in front of nearplane
4908                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4909                                         SKIPBACKFACE;
4910                                         numpoints = 3;
4911                                         clipcase = 0;
4912                                 }
4913                                 else
4914                                 {
                                             // only vertex 2 is behind: quad
4915                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4916                                         SKIPBACKFACE;
4917                                         numpoints = 4;
4918                                         clipcase = 1;
4919                                 }
4920                         }
4921                         else
4922                         {
4923                                 if (clipdist[2] >= 0.0f)
4924                                 {
                                             // only vertex 1 is behind: quad
4925                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4926                                         SKIPBACKFACE;
4927                                         numpoints = 4;
4928                                         clipcase = 2;
4929                                 }
4930                                 else
4931                                 {
                                             // only vertex 0 is in front: triangle
4932                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4933                                         SKIPBACKFACE;
4934                                         numpoints = 3;
4935                                         clipcase = 3;
4936                                 }
4937                         }
4938                 }
4939                 else if (clipdist[1] >= 0.0f)
4940                 {
4941                         if (clipdist[2] >= 0.0f)
4942                         {
                                     // only vertex 0 is behind: quad
4943                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4944                                 SKIPBACKFACE;
4945                                 numpoints = 4;
4946                                 clipcase = 4;
4947                         }
4948                         else
4949                         {
                                     // only vertex 1 is in front: triangle
4950                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4951                                 SKIPBACKFACE;
4952                                 numpoints = 3;
4953                                 clipcase = 5;
4954                         }
4955                 }
4956                 else if (clipdist[2] >= 0.0f)
4957                 {
                             // only vertex 2 is in front: triangle
4958                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4959                         SKIPBACKFACE;
4960                         numpoints = 3;
4961                         clipcase = 6;
4962                 }
4963                 else continue; // triangle is entirely behind nearplane
4964
4965                 {
4966                         // calculate integer y coords for triangle points
                             // screeni packs the polygon points as signed 16-bit (x, y)
                             // pairs; for a 3-point polygon point 2 is duplicated into
                             // the fourth slot.  The min/max shuffles reduce it to a
                             // bounding box, which is then clamped to fbmin/fbmax.
4967                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4968                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4969                                         screenmin = _mm_min_epi16(screeni, screenir),
4970                                         screenmax = _mm_max_epi16(screeni, screenir);
4971                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4972                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4973                         screenmin = _mm_max_epi16(screenmin, fbmin);
4974                         screenmax = _mm_min_epi16(screenmax, fbmax);
4975                         // skip offscreen triangles
4976                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4977                                 continue;
                             // row range covered by the polygon (endy is exclusive)
4978                         starty = _mm_extract_epi16(screenmin, 1);
4979                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // skip polygons lying entirely in the gap between the
                             // thread's two bands
4980                         if (starty >= maxy1 && endy <= miny2)
4981                                 continue;
                             // arithmetic shift leaves each point's integer y as one
                             // sign-extended 32-bit lane, for the per-row compares below
4982                         screeny = _mm_srai_epi32(screeni, 16);
4983                 }
4984
                     // next free triangle slot; numtriangles is only incremented
                     // after the spans for this polygon have been emitted
4985                 triangle = &thread->triangles[thread->numtriangles];
4986
4987                 // calculate attribute plans for triangle data...
4988                 // okay, this triangle is going to produce spans, we'd better project
4989                 // the interpolants now (this is what gives perspective texturing),
4990                 // this consists of simply multiplying all arrays by the W coord
4991                 // (which is basically 1/Z), which will be undone per-pixel
4992                 // (multiplying by Z again) to get the perspective-correct array
4993                 // values
4994                 {
4995                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4996                         __m128 mipedgescale, mipdensity;
                             // edge components divided by the cross product z (scalar
                             // lane of trianglenormal), then broadcast one per register
4997                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4998                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4999                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5000                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5001                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                             // broadcast the fourth component of each of the first three
                             // screen points
5002                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5003                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5004                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane equation for W itself: x slope, y slope and the
                             // value at screen position (0, 0), stored as triangle->w[0..2]
5005                         attribedge1 = _mm_sub_ss(w0, w1);
5006                         attribedge2 = _mm_sub_ss(w2, w1);
5007                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5008                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5009                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5010                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5011                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5012                         _mm_store_ss(&triangle->w[0], attribxslope);
5013                         _mm_store_ss(&triangle->w[1], attribyslope);
5014                         _mm_store_ss(&triangle->w[2], attriborigin);
5015                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every vertex array the shader mode
                             // lists (the list ends at the first out-of-range entry),
                             // with the attribute values pre-multiplied by W
5016                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5017                         {
5018                                 __m128 attrib0, attrib1, attrib2;
5019                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5020                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5021                                         break;
                                     // step to the next packed float[4] array of this command
5022                                 arrays += numvertices*4;
5023                                 GENATTRIBS(attrib0, attrib1, attrib2);
5024                                 attriborigin = _mm_mul_ps(attrib1, w1);
5025                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5026                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5027                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5028                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5029                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5030                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5031                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5032                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array the shader mode designates for LOD:
                                     // change of its first two components along each of the
                                     // two edges, scaled by the reciprocal screen-space
                                     // length of that edge
5033                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5034                                 {
5035                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5036                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5037                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5038                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5039                                 }
5040                         }
5041
                             // choose a mip level per texture unit used by the shader
                             // mode; units left untouched stay at level 0
5042                         memset(triangle->mip, 0, sizeof(triangle->mip));
5043                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5044                         {
5045                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5046                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5047                                         break;
5048                                 texture = thread->texbound[texunit];
5049                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5050                                 {
                                             // scale by two ints read from texture->mipmap[0][2..3]
                                             // (presumably the level 0 width and height - TODO
                                             // confirm), take the squared length per edge and keep
                                             // the smaller of the two
5051                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5052                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5053                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5054                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5055                                         // this will be multiplied in the texturing routine by the texture resolution
                                             // y is reused here as scratch for the mip level:
                                             // log(y)*0.5/ln(2) is log2 of the square root of the
                                             // squared density, clamped to the last available level
5056                                         y = _mm_cvtss_si32(mipdensity);
5057                                         if (y > 0)
5058                                         {
5059                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5060                                                 if (y > texture->mipmaps - 1)
5061                                                         y = texture->mipmaps - 1;
5062                                                 triangle->mip[texunit] = y;
5063                                         }
5064                                 }
5065                         }
5066                 }
5067         
                     // edge walking: the outer loop visits the thread's two bands in
                     // turn (bandy is the exclusive end row of the current band, and y
                     // jumps to miny2 for the second pass); the inner loop advances y
                     // through one band
5068                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5069                 for (; y < bandy;)
5070                 {
5071                         __m128 xcoords, xslope;
                             // each 32-bit lane of ycc is all ones when row y is greater
                             // than that point's integer y; movemask yields 4 bits per
                             // point.  In the case comments below the digits list the
                             // points in order, with 1 marking a point whose y has not
                             // been passed yet.
5072                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5073                         int yccmask = _mm_movemask_epi8(ycc);
                             // each of the two edges crossing this row runs from point
                             // edgeNp to point edgeNn
5074                         int edge0p, edge0n, edge1p, edge1n;
5075                         int nexty;
5076                         if (numpoints == 4)
5077                         {
5078                                 switch(yccmask)
5079                                 {
5080                                 default:
                                     // every point already passed: finished with this polygon
5081                                 case 0xFFFF: /*0000*/ y = endy; continue;
5082                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5083                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5084                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5085                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5086                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5087                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5088                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5089                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5090                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5091                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5092                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5093                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5094                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5095                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
                                     // no point passed yet: nothing to fill on this row
5096                                 case 0x0000: /*1111*/ y++; continue;
5097                                 }
5098                         }
5099                         else
5100                         {
                                     // 3-point polygon: the fourth lane duplicates point 2,
                                     // so only masks whose two upper nibbles agree are listed
5101                                 switch(yccmask)
5102                                 {
5103                                 default:
5104                                 case 0xFFFF: /*000*/ y = endy; continue;
5105                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5106                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5107                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5108                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5109                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5110                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5111                                 case 0x0000: /*111*/ y++; continue;
5112                                 }
5113                         }
                             // NOTE(review): this appears to compute the smallest integer
                             // y among the points not yet passed (passed lanes become
                             // 0x7FFF so they lose the min), i.e. the last row for which
                             // the current edge pair stays valid - confirm.  The result
                             // is limited to the last row of the current band.
5114                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5115                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5116                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5117                         nexty = _mm_extract_epi16(ycc, 0);
5118                         if (nexty >= bandy) nexty = bandy-1;
                             // lanes 0 and 2 of xslope become dx/dy of edge 0 and edge 1;
                             // xcoords lanes 0 and 2 are the x positions of the two edges
                             // on row y, offset by 0.5
5119                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5120                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5121                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5122                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5123                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // swap the two edges if needed so that lane 0 holds the
                             // smaller x
5124                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5125                         {
5126                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5127                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5128                         }
                             // emit spans for rows y..nexty, stepping both edge
                             // positions by their slopes after each row
5129                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5130                         {
5131                                 int startx, endx, offset;
5132                                 startx = _mm_cvtss_si32(xcoords);
5133                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // left of the scissor: clamp to 0, then skip ahead in
                                     // whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
                                     // (assumes that constant is a power of two - TODO
                                     // confirm); the remainder is handled per span below
5134                                 if (startx < minx) 
5135                                 {
5136                                         if (startx < 0) startx = 0;
5137                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5138                                 }
5139                                 if (endx > maxx) endx = maxx;
5140                                 if (startx >= endx) continue;
                                     // split the row into spans of at most
                                     // DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; span->x is the
                                     // origin of the piece and startx/endx are relative to it
5141                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5142                                 {
5143                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5144                                         span->triangle = thread->numtriangles;
5145                                         span->x = offset;
5146                                         span->y = y;
5147                                         span->startx = max(minx - offset, 0);
5148                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                             // empty piece: the slot is not committed and
                                             // will be reused
5149                                         if (span->startx >= span->endx)
5150                                                 continue; 
                                             // commit the span; render everything queued once
                                             // the span queue is full
5151                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5152                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5153                                 }
5154                         }
5155                 }
5156
                     // commit the triangle slot; when the triangle queue is full,
                     // render the queued spans before the slots are reused
5157                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5158                 {
5159                         DPSOFTRAST_Draw_ProcessSpans(thread);
5160                         thread->numtriangles = 0;
5161                 }
5162         }
5163
             // done reading the command's vertex data: drop this thread's
             // reference (same release logic as the early-out above)
5164         if (!ATOMIC_DECREMENT(command->refcount))
5165         {
5166                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5167                         MM_FREE(command->arrays);
5168         }
5169
             // render whatever is still queued and reset the triangle queue
5170         if (thread->numspans > 0 || thread->numtriangles > 0)
5171         {
5172                 DPSOFTRAST_Draw_ProcessSpans(thread);
5173                 thread->numtriangles = 0;
5174         }
5175 #endif
5176 }
5177
5178 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5179 {
5180         int i;
5181         int j;
5182         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5183         int datasize = 2*numvertices*sizeof(float[4]);
5184         DPSOFTRAST_Command_Draw *command;
5185         unsigned char *data;
5186         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5187         {
5188                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5189                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5190                         break;
5191                 datasize += numvertices*sizeof(float[4]);
5192         }
5193         if (element3s)
5194                 datasize += numtriangles*sizeof(unsigned short[3]);
5195         else if (element3i)
5196                 datasize += numtriangles*sizeof(int[3]);
5197         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5198         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5199         {
5200                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5201                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5202         }
5203         else
5204         {
5205                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5206                 data = (unsigned char *)command + commandsize;
5207         }
5208         command->firstvertex = firstvertex;
5209         command->numvertices = numvertices;
5210         command->numtriangles = numtriangles;
5211         command->arrays = (float *)data;
5212         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5213         dpsoftrast.firstvertex = firstvertex;
5214         dpsoftrast.numvertices = numvertices;
5215         dpsoftrast.screencoord4f = (float *)data;
5216         data += numvertices*sizeof(float[4]);
5217         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5218         data += numvertices*sizeof(float[4]);
5219         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5220         {
5221                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5222                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5223                         break;
5224                 dpsoftrast.post_array4f[j] = (float *)data;
5225                 data += numvertices*sizeof(float[4]);
5226         }
5227         command->element3i = NULL;
5228         command->element3s = NULL;
5229         if (element3s)
5230         {
5231                 command->element3s = (unsigned short *)data;
5232                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5233         }
5234         else if (element3i)
5235         {
5236                 command->element3i = (int *)data;
5237                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5238         }
5239         return command;
5240 }
5241
5242 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5243 {
5244         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5245         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5246         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5247         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5248         if (command->starty >= command->endy)
5249         {
5250                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5251                         MM_FREE(command->arrays);
5252                 DPSOFTRAST_UndoCommand(command->commandsize);
5253                 return;
5254         }
5255         command->clipped = dpsoftrast.drawclipped;
5256         command->refcount = dpsoftrast.numthreads;
5257
5258         if (dpsoftrast.usethreads)
5259         {
5260                 int i;
5261                 DPSOFTRAST_Draw_SyncCommands();
5262                 for (i = 0; i < dpsoftrast.numthreads; i++)
5263                 {
5264                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5265                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5266                                 Thread_CondSignal(thread->drawcond);
5267                 }
5268         }
5269         else
5270         {
5271                 DPSOFTRAST_Draw_FlushThreads();
5272         }
5273 }
5274
5275 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5276 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5277 {
5278         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5279 }
5280 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5281 {
5282         DPSOFTRAST_Command_SetRenderTargets *command;
5283         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5284                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5285                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5286                 DPSOFTRAST_Flush();
5287         dpsoftrast.fb_width = width;
5288         dpsoftrast.fb_height = height;
5289         dpsoftrast.fb_depthpixels = depthpixels;
5290         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5291         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5292         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5293         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5294         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5295         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5296         command->width = width;
5297         command->height = height;
5298 }
5299  
5300 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5301 {
5302         int commandoffset = thread->commandoffset;
5303         while (commandoffset != endoffset)
5304         {
5305                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5306                 switch (command->opcode)
5307                 {
5308 #define INTERPCOMMAND(name) \
5309                 case DPSOFTRAST_OPCODE_##name : \
5310                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5311                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5312                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5313                                 commandoffset = 0; \
5314                         break;
5315                 INTERPCOMMAND(Viewport)
5316                 INTERPCOMMAND(ClearColor)
5317                 INTERPCOMMAND(ClearDepth)
5318                 INTERPCOMMAND(ColorMask)
5319                 INTERPCOMMAND(DepthTest)
5320                 INTERPCOMMAND(ScissorTest)
5321                 INTERPCOMMAND(Scissor)
5322                 INTERPCOMMAND(BlendFunc)
5323                 INTERPCOMMAND(BlendSubtract)
5324                 INTERPCOMMAND(DepthMask)
5325                 INTERPCOMMAND(DepthFunc)
5326                 INTERPCOMMAND(DepthRange)
5327                 INTERPCOMMAND(PolygonOffset)
5328                 INTERPCOMMAND(CullFace)
5329                 INTERPCOMMAND(AlphaTest)
5330                 INTERPCOMMAND(AlphaFunc)
5331                 INTERPCOMMAND(SetTexture)
5332                 INTERPCOMMAND(SetShader)
5333                 INTERPCOMMAND(Uniform4f)
5334                 INTERPCOMMAND(UniformMatrix4f)
5335                 INTERPCOMMAND(Uniform1i)
5336                 INTERPCOMMAND(SetRenderTargets)
5337
5338                 case DPSOFTRAST_OPCODE_Draw:
5339                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5340                         commandoffset += command->commandsize;
5341                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5342                                 commandoffset = 0;
5343                         thread->commandoffset = commandoffset;
5344                         break;
5345
5346                 case DPSOFTRAST_OPCODE_Reset:
5347                         commandoffset = 0;
5348                         break;
5349                 }
5350         }
5351         thread->commandoffset = commandoffset;
5352 }
5353
5354 static int DPSOFTRAST_Draw_Thread(void *data)
5355 {
5356         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5357         while(thread->index >= 0)
5358         {
5359                 if (thread->commandoffset != dpsoftrast.drawcommand)
5360                 {
5361                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5362                 }
5363                 else 
5364                 {
5365                         Thread_LockMutex(thread->drawmutex);
5366                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5367                         {
5368                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5369                                 thread->starving = true;
5370                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5371                                 thread->starving = false;
5372                         }
5373                         Thread_UnlockMutex(thread->drawmutex);
5374                 }
5375         }   
5376         return 0;
5377 }
5378
5379 static void DPSOFTRAST_Draw_FlushThreads(void)
5380 {
5381         DPSOFTRAST_State_Thread *thread;
5382         int i;
5383         DPSOFTRAST_Draw_SyncCommands();
5384         if (dpsoftrast.usethreads) 
5385         {
5386                 for (i = 0; i < dpsoftrast.numthreads; i++)
5387                 {
5388                         thread = &dpsoftrast.threads[i];
5389                         if (thread->commandoffset != dpsoftrast.drawcommand)
5390                         {
5391                                 Thread_LockMutex(thread->drawmutex);
5392                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5393                                         Thread_CondSignal(thread->drawcond);
5394                                 Thread_UnlockMutex(thread->drawmutex);
5395                         }
5396                 }
5397                 for (i = 0; i < dpsoftrast.numthreads; i++)
5398                 {
5399                         thread = &dpsoftrast.threads[i];
5400                         if (thread->commandoffset != dpsoftrast.drawcommand)
5401                         {
5402                                 Thread_LockMutex(thread->drawmutex);
5403                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5404                                 {
5405                                         thread->waiting = true;
5406                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5407                                         thread->waiting = false;
5408                                 }
5409                                 Thread_UnlockMutex(thread->drawmutex);
5410                         }
5411                 }
5412         }
5413         else
5414         {
5415                 for (i = 0; i < dpsoftrast.numthreads; i++)
5416                 {
5417                         thread = &dpsoftrast.threads[i];
5418                         if (thread->commandoffset != dpsoftrast.drawcommand)
5419                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5420                 }
5421         }
5422         dpsoftrast.commandpool.usedcommands = 0;
5423 }
5424
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads, which
// runs the pending commands to completion on every thread.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5429
// Public finish entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5434
5435 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5436 {
5437         int i;
5438         union
5439         {
5440                 int i;
5441                 unsigned char b[4];
5442         }
5443         u;
5444         u.i = 1;
5445         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5446         dpsoftrast.bigendian = u.b[3];
5447         dpsoftrast.fb_width = width;
5448         dpsoftrast.fb_height = height;
5449         dpsoftrast.fb_depthpixels = depthpixels;
5450         dpsoftrast.fb_colorpixels[0] = colorpixels;
5451         dpsoftrast.fb_colorpixels[1] = NULL;
5452         dpsoftrast.fb_colorpixels[1] = NULL;
5453         dpsoftrast.fb_colorpixels[1] = NULL;
5454         dpsoftrast.viewport[0] = 0;
5455         dpsoftrast.viewport[1] = 0;
5456         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5457         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5458         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5459         dpsoftrast.texture_firstfree = 1;
5460         dpsoftrast.texture_end = 1;
5461         dpsoftrast.texture_max = 0;
5462         dpsoftrast.color[0] = 1;
5463         dpsoftrast.color[1] = 1;
5464         dpsoftrast.color[2] = 1;
5465         dpsoftrast.color[3] = 1;
5466         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5467         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5468         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5469         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5470         for (i = 0; i < dpsoftrast.numthreads; i++)
5471         {
5472                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5473                 thread->index = i;
5474                 thread->cullface = GL_BACK;
5475                 thread->colormask[1] = 1;
5476                 thread->colormask[2] = 1;
5477                 thread->colormask[3] = 1;
5478                 thread->blendfunc[0] = GL_ONE;
5479                 thread->blendfunc[1] = GL_ZERO;
5480                 thread->depthmask = true;
5481                 thread->depthtest = true;
5482                 thread->depthfunc = GL_LEQUAL;
5483                 thread->scissortest = false;
5484                 thread->alphatest = false;
5485                 thread->alphafunc = GL_GREATER;
5486                 thread->alphavalue = 0.5f;
5487                 thread->viewport[0] = 0;
5488                 thread->viewport[1] = 0;
5489                 thread->viewport[2] = dpsoftrast.fb_width;
5490                 thread->viewport[3] = dpsoftrast.fb_height;
5491                 thread->scissor[0] = 0;
5492                 thread->scissor[1] = 0;
5493                 thread->scissor[2] = dpsoftrast.fb_width;
5494                 thread->scissor[3] = dpsoftrast.fb_height;
5495                 thread->depthrange[0] = 0;
5496                 thread->depthrange[1] = 1;
5497                 thread->polygonoffset[0] = 0;
5498                 thread->polygonoffset[1] = 0;
5499         
5500                 DPSOFTRAST_RecalcThread(thread);
5501         
5502                 thread->numspans = 0;
5503                 thread->numtriangles = 0;
5504                 thread->commandoffset = 0;
5505                 thread->waiting = false;
5506                 thread->starving = false;
5507            
5508                 thread->validate = -1;
5509                 DPSOFTRAST_Validate(thread, -1);
5510  
5511                 if (dpsoftrast.usethreads)
5512                 {
5513                         thread->waitcond = Thread_CreateCond();
5514                         thread->drawcond = Thread_CreateCond();
5515                         thread->drawmutex = Thread_CreateMutex();
5516                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5517                 }
5518         }
5519         return 0;
5520 }
5521
5522 void DPSOFTRAST_Shutdown(void)
5523 {
5524         int i;
5525         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5526         {
5527                 DPSOFTRAST_State_Thread *thread;
5528                 for (i = 0; i < dpsoftrast.numthreads; i++)
5529                 {
5530                         thread = &dpsoftrast.threads[i];
5531                         Thread_LockMutex(thread->drawmutex);
5532                         thread->index = -1;
5533                         Thread_CondSignal(thread->drawcond);
5534                         Thread_UnlockMutex(thread->drawmutex);
5535                         Thread_WaitThread(thread->thread, 0);
5536                         Thread_DestroyCond(thread->waitcond);
5537                         Thread_DestroyCond(thread->drawcond);
5538                         Thread_DestroyMutex(thread->drawmutex);
5539                 }
5540         }
5541         for (i = 0;i < dpsoftrast.texture_end;i++)
5542                 if (dpsoftrast.texture[i].bytes)
5543                         MM_FREE(dpsoftrast.texture[i].bytes);
5544         if (dpsoftrast.texture)
5545                 free(dpsoftrast.texture);
5546         if (dpsoftrast.threads)
5547                 MM_FREE(dpsoftrast.threads);
5548         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5549 }
5550