]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fix row order of screen to texture copies
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
13 #define ALIGN_SIZE 16
14 #define ATOMIC_SIZE 32
15
16 #ifdef SSE2_PRESENT
17         #if defined(__APPLE__)
18                 #include <libkern/OSAtomic.h>
19                 #define ALIGN(var) var __attribute__((__aligned__(16)))
20                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21                 #define MEMORY_BARRIER (_mm_sfence())
22                 #define ATOMIC_COUNTER volatile int32_t 
23                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
26         #elif defined(__GNUC__)
27                 #define ALIGN(var) var __attribute__((__aligned__(16)))
28                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29                 #define MEMORY_BARRIER (_mm_sfence())
30                 //(__sync_synchronize())
31                 #define ATOMIC_COUNTER volatile int
32                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
35         #elif defined(_MSC_VER)
36                 #define ALIGN(var) __declspec(align(16)) var
37                 #define ATOMIC(var) __declspec(align(32)) var
38                 #define MEMORY_BARRIER (_mm_sfence())
39                 //(MemoryBarrier())
40                 #define ATOMIC_COUNTER volatile LONG
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
44         #endif
45 #endif
46
47 #ifndef ALIGN
48 #define ALIGN(var) var
49 #endif
50 #ifndef ATOMIC
51 #define ATOMIC(var) var
52 #endif
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
55 #endif
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
58 #endif
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
61 #endif
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
64 #endif
65 #ifndef ATOMIC_ADD
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
67 #endif
68
69 #ifdef SSE2_PRESENT
70 #include <emmintrin.h>
71
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
73
74 static void *MM_CALLOC(size_t nmemb, size_t size)
75 {
76         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77         if (ptr != NULL) memset(ptr, 0, nmemb*size);
78         return ptr;
79 }
80
81 #define MM_FREE _mm_free
82 #else
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
85 #define MM_FREE free
86 #endif
87
88 typedef enum DPSOFTRAST_ARRAY_e
89 {
90         DPSOFTRAST_ARRAY_POSITION,
91         DPSOFTRAST_ARRAY_COLOR,
92         DPSOFTRAST_ARRAY_TEXCOORD0,
93         DPSOFTRAST_ARRAY_TEXCOORD1,
94         DPSOFTRAST_ARRAY_TEXCOORD2,
95         DPSOFTRAST_ARRAY_TEXCOORD3,
96         DPSOFTRAST_ARRAY_TEXCOORD4,
97         DPSOFTRAST_ARRAY_TEXCOORD5,
98         DPSOFTRAST_ARRAY_TEXCOORD6,
99         DPSOFTRAST_ARRAY_TEXCOORD7,
100         DPSOFTRAST_ARRAY_TOTAL
101 }
102 DPSOFTRAST_ARRAY;
103
104 typedef struct DPSOFTRAST_Texture_s
105 {
106         int flags;
107         int width;
108         int height;
109         int depth;
110         int sides;
111         DPSOFTRAST_TEXTURE_FILTER filter;
112         int mipmaps;
113         int size;
114         ATOMIC_COUNTER binds;
115         unsigned char *bytes;
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
122
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
124 {
125         unsigned char opcode;
126         unsigned short commandsize;
127 }
128 DPSOFTRAST_Command);
129
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
131
132 #define DEFCOMMAND(opcodeval, name, fields) \
133         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
135         { \
136                 unsigned char opcode; \
137                 unsigned short commandsize; \
138                 fields \
139         } DPSOFTRAST_Command_##name );
140
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161         slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162         data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163                                         _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164                                                                 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
165 }
166 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
167         slope[0] = (triangle)->attribs[arrayindex][0][0]; \
168         slope[1] = (triangle)->attribs[arrayindex][0][1]; \
169         slope[2] = (triangle)->attribs[arrayindex][0][2]; \
170         slope[3] = (triangle)->attribs[arrayindex][0][3]; \
171         data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
172         data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
173         data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
174         data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
175 }
176                                         
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
178
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
180 {
181         int triangle; // triangle this span was generated by
182         int x; // framebuffer x coord
183         int y; // framebuffer y coord
184         int startx; // usable range (according to pixelmask)
185         int endx; // usable range (according to pixelmask)
186         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 }
188 DPSOFTRAST_State_Span);
189
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
192
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
198 typedef enum DPSOFTRAST_BLENDMODE_e
199 {
200         DPSOFTRAST_BLENDMODE_OPAQUE,
201         DPSOFTRAST_BLENDMODE_ALPHA,
202         DPSOFTRAST_BLENDMODE_ADDALPHA,
203         DPSOFTRAST_BLENDMODE_ADD,
204         DPSOFTRAST_BLENDMODE_INVMOD,
205         DPSOFTRAST_BLENDMODE_MUL,
206         DPSOFTRAST_BLENDMODE_MUL2,
207         DPSOFTRAST_BLENDMODE_SUBALPHA,
208         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209         DPSOFTRAST_BLENDMODE_INVADD,
210         DPSOFTRAST_BLENDMODE_TOTAL
211 }
212 DPSOFTRAST_BLENDMODE;
213
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
219         int cullface;
220         int colormask[4];
221         int blendfunc[2];
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
230         int viewport[4];
231         int scissor[4];
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237
238         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
239         
240         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
242
243         // DPSOFTRAST_VALIDATE_ flags
244         int validate;
245
246         // derived values (DPSOFTRAST_VALIDATE_FB)
247         int fb_colormask;
248         int fb_scissor[4];
249         ALIGN(float fb_viewportcenter[4]);
250         ALIGN(float fb_viewportscale[4]);
251
252         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253         int fb_depthfunc;
254
255         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
256         int fb_blendmode;
257
258         // band boundaries
259         int miny1;
260         int maxy1;
261         int miny2;
262         int maxy2;
263
264         ATOMIC(volatile int commandoffset);
265
266         volatile bool waiting;
267         volatile bool starving;
268         void *waitcond;
269         void *drawcond;
270         void *drawmutex;
271
272         int numspans;
273         int numtriangles;
274         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
276 }
277 DPSOFTRAST_State_Thread);
278
279 typedef ATOMIC(struct DPSOFTRAST_State_s
280 {
281         int fb_width;
282         int fb_height;
283         unsigned int *fb_depthpixels;
284         unsigned int *fb_colorpixels[4];
285
286         int viewport[4];
287         ALIGN(float fb_viewportcenter[4]);
288         ALIGN(float fb_viewportscale[4]);
289
290         float color[4];
291         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
293
294         const float *pointer_vertex3f;
295         const float *pointer_color4f;
296         const unsigned char *pointer_color4ub;
297         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298         int stride_vertex;
299         int stride_color;
300         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
303
304         int firstvertex;
305         int numvertices;
306         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307         float *screencoord4f;
308         int drawstarty;
309         int drawendy;
310         int drawclipped;
311         
312         int shader_mode;
313         int shader_permutation;
314
315         int texture_max;
316         int texture_end;
317         int texture_firstfree;
318         DPSOFTRAST_Texture *texture;
319
320         int bigendian;
321
322         // error reporting
323         const char *errorstring;
324
325         bool usethreads;
326         int interlace;
327         int numthreads;
328         DPSOFTRAST_State_Thread *threads;
329
330         ATOMIC(volatile int drawcommand);
331
332         DPSOFTRAST_State_Command_Pool commandpool;
333 }
334 DPSOFTRAST_State);
335
336 DPSOFTRAST_State dpsoftrast;
337
338 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
339 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
340 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
341 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
342 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
343
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
345 {
346         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348         fb_viewportcenter[3] = 0.5f;
349         fb_viewportcenter[0] = 0.0f;
350         fb_viewportscale[1] = 0.5f * viewport[2];
351         fb_viewportscale[2] = -0.5f * viewport[3];
352         fb_viewportscale[3] = 0.5f;
353         fb_viewportscale[0] = 1.0f;
354 }
355
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
357 {
358         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359         // and viewport projection values
360         int x1, x2;
361         int y1, y2;
362         x1 = thread->scissor[0];
363         x2 = thread->scissor[0] + thread->scissor[2];
364         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365         y2 = dpsoftrast.fb_height - thread->scissor[1];
366         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
367         if (x1 < 0) x1 = 0;
368         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
369         if (y1 < 0) y1 = 0;
370         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371         thread->fb_scissor[0] = x1;
372         thread->fb_scissor[1] = y1;
373         thread->fb_scissor[2] = x2 - x1;
374         thread->fb_scissor[3] = y2 - y1;
375
376         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
377 }
378
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
380 {
381         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
382 }
383
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
385 {
386         if (thread->blendsubtract)
387         {
388                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
389                 {
390                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
394                 }
395         }
396         else
397         {       
398                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
399                 {
400                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
405                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
411                 }
412         }
413 }
414
415 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
416
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
418 {
419         mask &= thread->validate;
420         if (!mask)
421                 return;
422         if (mask & DPSOFTRAST_VALIDATE_FB)
423         {
424                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425                 DPSOFTRAST_RecalcFB(thread);
426         }
427         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
428         {
429                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430                 DPSOFTRAST_RecalcDepthFunc(thread);
431         }
432         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
433         {
434                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435                 DPSOFTRAST_RecalcBlendFunc(thread);
436         }
437 }
438
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
440 {
441         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442                 return &dpsoftrast.texture[index];
443         return NULL;
444 }
445
446 static void DPSOFTRAST_Texture_Grow(void)
447 {
448         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449         DPSOFTRAST_State_Thread *thread;
450         int i;
451         int j;
452         DPSOFTRAST_Flush();
453         // expand texture array as needed
454         if (dpsoftrast.texture_max < 1024)
455                 dpsoftrast.texture_max = 1024;
456         else
457                 dpsoftrast.texture_max *= 2;
458         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
459         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460                 if (dpsoftrast.texbound[i])
461                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
462         for (j = 0; j < dpsoftrast.numthreads; j++)
463         {
464                 thread = &dpsoftrast.threads[j];
465                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466                         if (thread->texbound[i])
467                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
468         }
469 }
470
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
472 {
473         int w;
474         int h;
475         int d;
476         int size;
477         int s;
478         int texnum;
479         int mipmaps;
480         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482         DPSOFTRAST_Texture *texture;
483         if (width*height*depth < 1)
484         {
485                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
486                 return 0;
487         }
488         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
489         {
490                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
491                 return 0;
492         }
493         switch(texformat)
494         {
495         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
498                 break;
499         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
501                 {
502                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
503                         return 0;
504                 }
505                 if (depth != 1)
506                 {
507                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
508                         return 0;
509                 }
510                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
511                 {
512                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
513                         return 0;
514                 }
515                 break;
516         }
517         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
518         {
519                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
520                 return 0;
521         }
522         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
523         {
524                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
525                 return 0;
526         }
527         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
528         {
529                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
530                 return 0;
531         }
532         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
533         {
534                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
535                 return 0;
536         }
537         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
538         {
539                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
540                 return 0;
541         }
542         // find first empty slot in texture array
543         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544                 if (!dpsoftrast.texture[texnum].bytes)
545                         break;
546         dpsoftrast.texture_firstfree = texnum + 1;
547         if (dpsoftrast.texture_max <= texnum)
548                 DPSOFTRAST_Texture_Grow();
549         if (dpsoftrast.texture_end <= texnum)
550                 dpsoftrast.texture_end = texnum + 1;
551         texture = &dpsoftrast.texture[texnum];
552         memset(texture, 0, sizeof(*texture));
553         texture->flags = flags;
554         texture->width = width;
555         texture->height = height;
556         texture->depth = depth;
557         texture->sides = sides;
558         texture->binds = 0;
559         w = width;
560         h = height;
561         d = depth;
562         size = 0;
563         mipmaps = 0;
564         w = width;
565         h = height;
566         d = depth;
567         for (;;)
568         {
569                 s = w * h * d * sides * 4;
570                 texture->mipmap[mipmaps][0] = size;
571                 texture->mipmap[mipmaps][1] = s;
572                 texture->mipmap[mipmaps][2] = w;
573                 texture->mipmap[mipmaps][3] = h;
574                 texture->mipmap[mipmaps][4] = d;
575                 size += s;
576                 mipmaps++;
577                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578                         break;
579                 if (w > 1) w >>= 1;
580                 if (h > 1) h >>= 1;
581                 if (d > 1) d >>= 1;
582         }
583         texture->mipmaps = mipmaps;
584         texture->size = size;
585
586         // allocate the pixels now
587         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
588
589         return texnum;
590 }
591 void DPSOFTRAST_Texture_Free(int index)
592 {
593         DPSOFTRAST_Texture *texture;
594         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595         if (texture->binds)
596                 DPSOFTRAST_Flush();
597         if (texture->bytes)
598                 MM_FREE(texture->bytes);
599         texture->bytes = NULL;
600         memset(texture, 0, sizeof(*texture));
601         // adjust the free range and used range
602         if (dpsoftrast.texture_firstfree > index)
603                 dpsoftrast.texture_firstfree = index;
604         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605                 dpsoftrast.texture_end--;
606 }
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
608 {
609         int i, x, y, z, w, layer0, layer1, row0, row1;
610         unsigned char *o, *i0, *i1, *i2, *i3;
611         DPSOFTRAST_Texture *texture;
612         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613         if (texture->mipmaps <= 1)
614                 return;
615         for (i = 1;i < texture->mipmaps;i++)
616         {
617                 for (z = 0;z < texture->mipmap[i][4];z++)
618                 {
619                         layer0 = z*2;
620                         layer1 = z*2+1;
621                         if (layer1 >= texture->mipmap[i-1][4])
622                                 layer1 = texture->mipmap[i-1][4]-1;
623                         for (y = 0;y < texture->mipmap[i][3];y++)
624                         {
625                                 row0 = y*2;
626                                 row1 = y*2+1;
627                                 if (row1 >= texture->mipmap[i-1][3])
628                                         row1 = texture->mipmap[i-1][3]-1;
629                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
630                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634                                 w = texture->mipmap[i][2];
635                                 if (layer1 > layer0)
636                                 {
637                                         if (texture->mipmap[i-1][2] > 1)
638                                         {
639                                                 // average 3D texture
640                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
641                                                 {
642                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
646                                                 }
647                                         }
648                                         else
649                                         {
650                                                 // average 3D mipmap with parent width == 1
651                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
652                                                 {
653                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
657                                                 }
658                                         }
659                                 }
660                                 else
661                                 {
662                                         if (texture->mipmap[i-1][2] > 1)
663                                         {
664                                                 // average 2D texture (common case)
665                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
666                                                 {
667                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
671                                                 }
672                                         }
673                                         else
674                                         {
675                                                 // 2D texture with parent width == 1
676                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
677                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
678                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
679                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
680                                         }
681                                 }
682                         }
683                 }
684         }
685 }
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
687 {
688         DPSOFTRAST_Texture *texture;
689         unsigned char *dst;
690         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
691         if (texture->binds)
692                 DPSOFTRAST_Flush();
693         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694         while (blockheight > 0)
695         {
696                 memcpy(dst, pixels, blockwidth * 4);
697                 pixels += blockwidth * 4;
698                 dst += texture->mipmap[0][2] * 4;
699                 blockheight--;
700         }
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
707         if (texture->binds)
708                 DPSOFTRAST_Flush();
709         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710         DPSOFTRAST_Texture_CalculateMipmaps(index);
711 }
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
713 {
714         DPSOFTRAST_Texture *texture;
715         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716         return texture->mipmap[mip][2];
717 }
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
719 {
720         DPSOFTRAST_Texture *texture;
721         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722         return texture->mipmap[mip][3];
723 }
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
725 {
726         DPSOFTRAST_Texture *texture;
727         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728         return texture->mipmap[mip][4];
729 }
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
734         if (texture->binds)
735                 DPSOFTRAST_Flush();
736         return texture->bytes + texture->mipmap[mip][0];
737 }
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
739 {
740         DPSOFTRAST_Texture *texture;
741         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
743         {
744                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
745                 return;
746         }
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         texture->filter = filter;
750 }
751
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
753 {
754         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
757                 DPSOFTRAST_Flush();
758         dpsoftrast.fb_width = width;
759         dpsoftrast.fb_height = height;
760         dpsoftrast.fb_depthpixels = depthpixels;
761         dpsoftrast.fb_colorpixels[0] = colorpixels0;
762         dpsoftrast.fb_colorpixels[1] = colorpixels1;
763         dpsoftrast.fb_colorpixels[2] = colorpixels2;
764         dpsoftrast.fb_colorpixels[3] = colorpixels3;
765 }
766
767 static void DPSOFTRAST_Draw_FlushThreads(void);
768
769 static void DPSOFTRAST_Draw_SyncCommands(void)
770 {
771         if(dpsoftrast.usethreads) MEMORY_BARRIER;
772         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
773 }
774
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
776 {
777         DPSOFTRAST_State_Thread *thread;
778         int i;
779         int freecommand = dpsoftrast.commandpool.freecommand;
780         int usedcommands = dpsoftrast.commandpool.usedcommands;
781         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
782                 return;
783         DPSOFTRAST_Draw_SyncCommands();
784         for(;;)
785         {
786                 int waitindex = -1;
787                 int commandoffset;
788                 usedcommands = 0;
789                 for (i = 0; i < dpsoftrast.numthreads; i++)
790                 {
791                         thread = &dpsoftrast.threads[i]; 
792                         commandoffset = freecommand - thread->commandoffset;
793                         if (commandoffset < 0)
794                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
795                         if (commandoffset > usedcommands)
796                         {
797                                 waitindex = i;
798                                 usedcommands = commandoffset;
799                         }
800                 }
801                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
802                         break;
803                 thread = &dpsoftrast.threads[waitindex];
804                 Thread_LockMutex(thread->drawmutex);
805                 if (thread->commandoffset != dpsoftrast.drawcommand)
806                 {
807                         thread->waiting = true;
808                         if (thread->starving) Thread_CondSignal(thread->drawcond);
809                         Thread_CondWait(thread->waitcond, thread->drawmutex);
810                         thread->waiting = false;
811                 }
812                 Thread_UnlockMutex(thread->drawmutex);
813         }
814         dpsoftrast.commandpool.usedcommands = usedcommands;
815 }
816
817 #define DPSOFTRAST_ALIGNCOMMAND(size) \
818         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
819 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
820         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
821
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
823 {
824         DPSOFTRAST_Command *command;
825         int freecommand = dpsoftrast.commandpool.freecommand;
826         int usedcommands = dpsoftrast.commandpool.usedcommands;
827         int extra = sizeof(DPSOFTRAST_Command);
828         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
830         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
831         {
832                 if (dpsoftrast.usethreads)
833                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834                 else
835                         DPSOFTRAST_Draw_FlushThreads();
836                 freecommand = dpsoftrast.commandpool.freecommand;
837                 usedcommands = dpsoftrast.commandpool.usedcommands;
838         }
839         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
840         {
841                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842                 command->opcode = DPSOFTRAST_OPCODE_Reset;
843                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
844                 freecommand = 0;
845         }
846         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847         command->opcode = opcode;
848         command->commandsize = size;
849         freecommand += size;
850         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
851                 freecommand = 0;
852         dpsoftrast.commandpool.freecommand = freecommand;
853         dpsoftrast.commandpool.usedcommands = usedcommands + size;
854         return command;
855 }
856
857 static void DPSOFTRAST_UndoCommand(int size)
858 {
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         freecommand -= size;
862         if (freecommand < 0)
863                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864         usedcommands -= size;
865         dpsoftrast.commandpool.freecommand = freecommand;
866         dpsoftrast.commandpool.usedcommands = usedcommands;
867 }
868                 
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
871 {
872         thread->viewport[0] = command->x;
873         thread->viewport[1] = command->y;
874         thread->viewport[2] = command->width;
875         thread->viewport[3] = command->height;
876         thread->validate |= DPSOFTRAST_VALIDATE_FB;
877 }
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
879 {
880         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881         command->x = x;
882         command->y = y;
883         command->width = width;
884         command->height = height;
885
886         dpsoftrast.viewport[0] = x;
887         dpsoftrast.viewport[1] = y;
888         dpsoftrast.viewport[2] = width;
889         dpsoftrast.viewport[3] = height;
890         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
891 }
892
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
895 {
896         int i, x1, y1, x2, y2, w, h, x, y;
897         int miny1 = thread->miny1;
898         int maxy1 = thread->maxy1;
899         int miny2 = thread->miny2;
900         int maxy2 = thread->maxy2;
901         int bandy;
902         unsigned int *p;
903         unsigned int c;
904         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905         x1 = thread->fb_scissor[0];
906         y1 = thread->fb_scissor[1];
907         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
909         if (y1 < miny1) y1 = miny1;
910         if (y2 > maxy2) y2 = maxy2;
911         w = x2 - x1;
912         h = y2 - y1;
913         if (w < 1 || h < 1)
914                 return;
915         // FIXME: honor fb_colormask?
916         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917         for (i = 0;i < 4;i++)
918         {
919                 if (!dpsoftrast.fb_colorpixels[i])
920                         continue;
921                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922                 for (;y < bandy;y++)
923                 {
924                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925                         for (x = x1;x < x2;x++)
926                                 p[x] = c;
927                 }
928         }
929 }
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
931 {
932         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
933         command->r = r;
934         command->g = g;
935         command->b = b;
936         command->a = a;
937 }
938
939 DEFCOMMAND(3, ClearDepth, float depth;)
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
941 {
942         int x1, y1, x2, y2, w, h, x, y;
943         int miny1 = thread->miny1;
944         int maxy1 = thread->maxy1;
945         int miny2 = thread->miny2;
946         int maxy2 = thread->maxy2;
947         int bandy;
948         unsigned int *p;
949         unsigned int c;
950         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
962         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963         for (;y < bandy;y++)
964         {
965                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966                 for (x = x1;x < x2;x++)
967                         p[x] = c;
968         }
969 }
970 void DPSOFTRAST_ClearDepth(float d)
971 {
972         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
973         command->depth = d;
974 }
975
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
978 {
979         thread->colormask[0] = command->r != 0;
980         thread->colormask[1] = command->g != 0;
981         thread->colormask[2] = command->b != 0;
982         thread->colormask[3] = command->a != 0;
983         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
984 }
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
986 {
987         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(5, DepthTest, int enable;)
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
996 {
997         thread->depthtest = command->enable;
998         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
999 }
1000 void DPSOFTRAST_DepthTest(int enable)
1001 {
1002         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003         command->enable = enable;
1004 }
1005
1006 DEFCOMMAND(6, ScissorTest, int enable;)
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1008 {
1009         thread->scissortest = command->enable;
1010         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1011 }
1012 void DPSOFTRAST_ScissorTest(int enable)
1013 {
1014         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015         command->enable = enable;
1016 }
1017
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1020 {
1021         thread->scissor[0] = command->x;
1022         thread->scissor[1] = command->y;
1023         thread->scissor[2] = command->width;
1024         thread->scissor[3] = command->height;
1025         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 }
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1028 {
1029         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030         command->x = x;
1031         command->y = y;
1032         command->width = width;
1033         command->height = height;
1034 }
1035
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1038 {
1039         thread->blendfunc[0] = command->sfactor;
1040         thread->blendfunc[1] = command->dfactor;
1041         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1042 }
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1044 {
1045         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046         command->sfactor = sfactor;
1047         command->dfactor = dfactor;
1048 }
1049
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1052 {
1053         thread->blendsubtract = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1055 }
1056 void DPSOFTRAST_BlendSubtract(int enable)
1057 {
1058         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(10, DepthMask, int enable;)
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1064 {
1065         thread->depthmask = command->enable;
1066 }
1067 void DPSOFTRAST_DepthMask(int enable)
1068 {
1069         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070         command->enable = enable;
1071 }
1072
1073 DEFCOMMAND(11, DepthFunc, int func;)
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1075 {
1076         thread->depthfunc = command->func;
1077 }
1078 void DPSOFTRAST_DepthFunc(int func)
1079 {
1080         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081         command->func = func;
1082 }
1083
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1086 {
1087         thread->depthrange[0] = command->nearval;
1088         thread->depthrange[1] = command->farval;
1089 }
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1091 {
1092         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093         command->nearval = nearval;
1094         command->farval = farval;
1095 }
1096
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1099 {
1100         thread->polygonoffset[0] = command->alongnormal;
1101         thread->polygonoffset[1] = command->intoview;
1102 }
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1104 {
1105         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106         command->alongnormal = alongnormal;
1107         command->intoview = intoview;
1108 }
1109
1110 DEFCOMMAND(14, CullFace, int mode;)
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1112 {
1113         thread->cullface = command->mode;
1114 }
1115 void DPSOFTRAST_CullFace(int mode)
1116 {
1117         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118         command->mode = mode;
1119 }
1120
1121 DEFCOMMAND(15, AlphaTest, int enable;)
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1123 {
1124         thread->alphatest = command->enable;
1125 }
1126 void DPSOFTRAST_AlphaTest(int enable)
1127 {
1128         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129         command->enable = enable;
1130 }
1131
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1134 {
1135         thread->alphafunc = command->func;
1136         thread->alphavalue = command->ref;
1137 }
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1139 {
1140         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141         command->func = func;
1142         command->ref = ref;
1143 }
1144
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1146 {
1147         dpsoftrast.color[0] = r;
1148         dpsoftrast.color[1] = g;
1149         dpsoftrast.color[2] = b;
1150         dpsoftrast.color[3] = a;
1151 }
1152
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1154 {
1155         int outstride = blockwidth * 4;
1156         int instride = dpsoftrast.fb_width * 4;
1157         int bx1 = blockx;
1158         int by1 = blocky;
1159         int bx2 = blockx + blockwidth;
1160         int by2 = blocky + blockheight;
1161         int bw;
1162         int x;
1163         int y;
1164         unsigned char *inpixels;
1165         unsigned char *b;
1166         unsigned char *o;
1167         DPSOFTRAST_Flush();
1168         if (bx1 < 0) bx1 = 0;
1169         if (by1 < 0) by1 = 0;
1170         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1172         bw = bx2 - bx1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         DPSOFTRAST_Flush();
1227         spixels = dpsoftrast.fb_colorpixels[0];
1228         swidth = dpsoftrast.fb_width;
1229         sheight = dpsoftrast.fb_height;
1230         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231         twidth = texture->mipmap[mip][2];
1232         theight = texture->mipmap[mip][3];
1233         if (tx1 < 0) tx1 = 0;
1234         if (ty1 < 0) ty1 = 0;
1235         if (tx2 > twidth) tx2 = twidth;
1236         if (ty2 > theight) ty2 = theight;
1237         if (sx1 < 0) sx1 = 0;
1238         if (sy1 < 0) sy1 = 0;
1239         if (sx2 > swidth) sx2 = swidth;
1240         if (sy2 > sheight) sy2 = sheight;
1241         tw = tx2 - tx1;
1242         th = ty2 - ty1;
1243         sw = sx2 - sx1;
1244         sh = sy2 - sy1;
1245         if (tw > sw) tw = sw;
1246         if (th > sh) th = sh;
1247         if (tw < 1 || th < 1)
1248                 return;
1249         sy1 = sheight - 1 - sy1;
1250         for (y = 0;y < th;y++)
1251                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1252         if (texture->mipmaps > 1)
1253                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1254 }
1255
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 {
1259         if (thread->texbound[command->unitnum])
1260                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->unitnum = unitnum;
1281         command->texture = texture;
1282
1283         dpsoftrast.texbound[unitnum] = texture;
1284         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->mode = mode;
1321         command->permutation = permutation;
1322
1323         dpsoftrast.shader_mode = mode;
1324         dpsoftrast.shader_permutation = permutation;
1325 }
1326
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1329 {
1330         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1331 }
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         command->val[0] = v0;
1337         command->val[1] = v1;
1338         command->val[2] = v2;
1339         command->val[3] = v3;
1340
1341         dpsoftrast.uniform4f[index*4+0] = v0;
1342         dpsoftrast.uniform4f[index*4+1] = v1;
1343         dpsoftrast.uniform4f[index*4+2] = v2;
1344         dpsoftrast.uniform4f[index*4+3] = v3;
1345 }
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 {
1348         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349         command->index = index;
1350         memcpy(command->val, v, sizeof(command->val));
1351
1352         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1353 }
1354
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1361 {
1362 #ifdef SSE2_PRESENT
1363         int i, index;
1364         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365         {
1366                 __m128 m0, m1, m2, m3;
1367                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368                 command->index = (DPSOFTRAST_UNIFORM)index;
1369                 if (((size_t)v)&(ALIGN_SIZE-1))
1370                 {
1371                         m0 = _mm_loadu_ps(v);
1372                         m1 = _mm_loadu_ps(v+4);
1373                         m2 = _mm_loadu_ps(v+8);
1374                         m3 = _mm_loadu_ps(v+12);
1375                 }
1376                 else
1377                 {
1378                         m0 = _mm_load_ps(v);
1379                         m1 = _mm_load_ps(v+4);
1380                         m2 = _mm_load_ps(v+8);
1381                         m3 = _mm_load_ps(v+12);
1382                 }
1383                 if (transpose)
1384                 {
1385                         __m128 t0, t1, t2, t3;
1386                         t0 = _mm_unpacklo_ps(m0, m1);
1387                         t1 = _mm_unpacklo_ps(m2, m3);
1388                         t2 = _mm_unpackhi_ps(m0, m1);
1389                         t3 = _mm_unpackhi_ps(m2, m3);
1390                         m0 = _mm_movelh_ps(t0, t1);
1391                         m1 = _mm_movehl_ps(t1, t0);
1392                         m2 = _mm_movelh_ps(t2, t3);
1393                         m3 = _mm_movehl_ps(t3, t2);                     
1394                 }
1395                 _mm_store_ps(command->val, m0);
1396                 _mm_store_ps(command->val+4, m1);
1397                 _mm_store_ps(command->val+8, m2);
1398                 _mm_store_ps(command->val+12, m3);
1399                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1403         }
1404 #endif
1405 }
1406
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1409 {
1410         thread->uniform1i[command->index] = command->val;
1411 }
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1413 {
1414         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415         command->index = index;
1416         command->val = i0;
1417
1418         dpsoftrast.uniform1i[command->index] = i0;
1419 }
1420
1421 #ifdef SSE2_PRESENT
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1423 {
1424         float *end = dst + size*4;
1425         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1426         {
1427                 while (dst < end)
1428                 {
1429                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1430                         dst += 4;
1431                         src += stride;
1432                 }
1433         }
1434         else
1435         {
1436                 while (dst < end)
1437                 {
1438                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1439                         dst += 4;
1440                         src += stride;
1441                 }
1442         }
1443 }
1444
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1446 {
1447         float *end = dst + size*4;
1448         if (stride == sizeof(float[3]))
1449         {
1450                 float *end4 = dst + (size&~3)*4;        
1451                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1452                 {
1453                         while (dst < end4)
1454                         {
1455                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1456                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dst += 16;
1469                                 src += 4*sizeof(float[3]);
1470                         }
1471                 }
1472                 else
1473                 {
1474                         while (dst < end4)
1475                         {
1476                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489                                 dst += 16;
1490                                 src += 4*sizeof(float[3]);
1491                         }
1492                 }
1493         }
1494         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1495         {
1496                 while (dst < end)
1497                 {
1498                         __m128 v = _mm_loadu_ps((const float *)src);
1499                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502                         _mm_store_ps(dst, v);
1503                         dst += 4;
1504                         src += stride;
1505                 }
1506         }
1507         else
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_load_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520 }
1521
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1523 {
1524         float *end = dst + size*4;
1525         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526         if (stride == sizeof(float[2]))
1527         {
1528                 float *end2 = dst + (size&~1)*4;
1529                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1530                 {
1531                         while (dst < end2)
1532                         {
1533                                 __m128 v = _mm_loadu_ps((const float *)src);
1534                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1536                                 dst += 8;
1537                                 src += 2*sizeof(float[2]);
1538                         }
1539                 }
1540                 else
1541                 {
1542                         while (dst < end2)
1543                         {
1544                                 __m128 v = _mm_load_ps((const float *)src);
1545                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1547                                 dst += 8;
1548                                 src += 2*sizeof(float[2]);
1549                         }
1550                 }
1551         }
1552         while (dst < end)
1553         {
1554                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1555                 dst += 4;
1556                 src += stride;
1557         }
1558 }
1559
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 {
1562         float *end = dst + size*4;
1563         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564         if (stride == sizeof(unsigned char[4]))
1565         {
1566                 float *end4 = dst + (size&~3)*4;
1567                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1568                 {
1569                         while (dst < end4)
1570                         {
1571                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576                                 dst += 16;
1577                                 src += 4*sizeof(unsigned char[4]);
1578                         }
1579                 }
1580                 else
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593         }
1594         while (dst < end)
1595         {
1596                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1598                 dst += 4;
1599                 src += stride;
1600         }
1601 }
1602
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1604 {
1605         float *end = dst + 4*size;
1606         __m128 v = _mm_loadu_ps(src);
1607         while (dst < end)
1608         {
1609                 _mm_store_ps(dst, v);
1610                 dst += 4;
1611         }
1612 }
1613 #endif
1614
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1616 {
1617 #ifdef SSE2_PRESENT
1618         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619         __m128 m0, m1, m2, m3;
1620         float *end;
1621         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1622         {
1623                 // fast case for identity matrix
1624                 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1625                 return;
1626         }
1627         end = out4f + numitems*4;
1628         m0 = _mm_loadu_ps(inmatrix16f);
1629         m1 = _mm_loadu_ps(inmatrix16f + 4);
1630         m2 = _mm_loadu_ps(inmatrix16f + 8);
1631         m3 = _mm_loadu_ps(inmatrix16f + 12);
1632         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1633         {
1634                 while (out4f < end)
1635                 {
1636                         __m128 v = _mm_loadu_ps(in4f);
1637                         _mm_store_ps(out4f,
1638                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640                                                         _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641                                                                                 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1642                         out4f += 4;
1643                         in4f += 4;
1644                 }
1645         }
1646         else
1647         {
1648                 while (out4f < end)
1649                 {
1650                         __m128 v = _mm_load_ps(in4f);
1651                         _mm_store_ps(out4f,
1652                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654                                                         _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655                                                                                 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1656                         out4f += 4;
1657                         in4f += 4;
1658                 }
1659         }
1660 #endif
1661 }
1662
1663 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1664 {
1665         memcpy(out4f, in4f, numitems * sizeof(float[4]));
1666 }
1667
1668 #ifdef SSE2_PRESENT
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1670 { \
1671         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1675 }
1676
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1678 { \
1679         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1683 }
1684
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1686 { \
1687         __m128 p = (in); \
1688         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1692 }
1693
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1695 {
1696         int clipmask = 0xFF;
1697         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1698         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1702         #define BBFRONT(k, pos) \
1703         { \
1704                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1707                 { \
1708                         __m128 proj; \
1709                         clipmask &= ~(1<<k); \
1710                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711                         minproj = _mm_min_ss(minproj, proj); \
1712                         maxproj = _mm_max_ss(maxproj, proj); \
1713                 } \
1714         }
1715         BBFRONT(0, minpos); 
1716         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1717         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1718         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1719         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1720         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1721         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1722         BBFRONT(7, maxpos);
1723         #define BBCLIP(k) \
1724         { \
1725                 if (clipmask&(1<<k)) \
1726                 { \
1727                         if (!(clipmask&(1<<(k^1)))) \
1728                         { \
1729                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732                                 minproj = _mm_min_ss(minproj, proj); \
1733                                 maxproj = _mm_max_ss(maxproj, proj); \
1734                         } \
1735                         if (!(clipmask&(1<<(k^2)))) \
1736                         { \
1737                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740                                 minproj = _mm_min_ss(minproj, proj); \
1741                                 maxproj = _mm_max_ss(maxproj, proj); \
1742                         } \
1743                         if (!(clipmask&(1<<(k^4)))) \
1744                         { \
1745                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748                                 minproj = _mm_min_ss(minproj, proj); \
1749                                 maxproj = _mm_max_ss(maxproj, proj); \
1750                         } \
1751                 } \
1752         }
1753         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1754         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1756         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1760         *starty = _mm_cvttss_si32(maxproj);
1761         *endy = _mm_cvttss_si32(minproj)+1;
1762         return clipmask;
1763 }
1764         
1765 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1766 {
1767         float *end = out4f + numitems*4;
1768         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1769         __m128 minpos, maxpos;
1770         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1771         {
1772                 minpos = maxpos = _mm_loadu_ps(in4f);
1773                 while (out4f < end)
1774                 {
1775                         __m128 v = _mm_loadu_ps(in4f);
1776                         minpos = _mm_min_ps(minpos, v);
1777                         maxpos = _mm_max_ps(maxpos, v);
1778                         _mm_store_ps(out4f, v);
1779                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1780                         _mm_store_ps(screen4f, v);
1781                         in4f += 4;
1782                         out4f += 4;
1783                         screen4f += 4;
1784                 }
1785         }
1786         else
1787         {
1788                 minpos = maxpos = _mm_load_ps(in4f);
1789                 while (out4f < end)
1790                 {
1791                         __m128 v = _mm_load_ps(in4f);
1792                         minpos = _mm_min_ps(minpos, v);
1793                         maxpos = _mm_max_ps(maxpos, v);
1794                         _mm_store_ps(out4f, v);
1795                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1796                         _mm_store_ps(screen4f, v);
1797                         in4f += 4;
1798                         out4f += 4;
1799                         screen4f += 4;
1800                 }
1801         }
1802         if (starty && endy) 
1803                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1804                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1805                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1806                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1807                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1808         return 0;
1809 }
1810
1811 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1812 {
1813         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1814         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1815         float *end;
1816         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1817                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1818         end = out4f + numitems*4;
1819         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1820         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1821         m0 = _mm_loadu_ps(inmatrix16f);
1822         m1 = _mm_loadu_ps(inmatrix16f + 4);
1823         m2 = _mm_loadu_ps(inmatrix16f + 8);
1824         m3 = _mm_loadu_ps(inmatrix16f + 12);
1825         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1826         {
1827                 minpos = maxpos = _mm_loadu_ps(in4f);
1828                 while (out4f < end)
1829                 {
1830                         __m128 v = _mm_loadu_ps(in4f);
1831                         minpos = _mm_min_ps(minpos, v);
1832                         maxpos = _mm_max_ps(maxpos, v);
1833                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1834                         _mm_store_ps(out4f, v);
1835                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1836                         _mm_store_ps(screen4f, v);
1837                         in4f += 4;
1838                         out4f += 4;
1839                         screen4f += 4;
1840                 }
1841         }
1842         else
1843         {
1844                 minpos = maxpos = _mm_load_ps(in4f);
1845                 while (out4f < end)
1846                 {
1847                         __m128 v = _mm_load_ps(in4f);
1848                         minpos = _mm_min_ps(minpos, v);
1849                         maxpos = _mm_max_ps(maxpos, v);
1850                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1851                         _mm_store_ps(out4f, v);
1852                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853                         _mm_store_ps(screen4f, v);
1854                         in4f += 4;
1855                         out4f += 4;
1856                         screen4f += 4;
1857                 }
1858         }
1859         if (starty && endy) 
1860                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1861         return 0;
1862 }
1863 #endif
1864
1865 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1866 {
1867 #ifdef SSE2_PRESENT
1868         float *outf = dpsoftrast.post_array4f[outarray];
1869         const unsigned char *inb;
1870         int firstvertex = dpsoftrast.firstvertex;
1871         int numvertices = dpsoftrast.numvertices;
1872         int stride;
1873         switch(inarray)
1874         {
1875         case DPSOFTRAST_ARRAY_POSITION:
1876                 stride = dpsoftrast.stride_vertex;
1877                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1878                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1879                 break;
1880         case DPSOFTRAST_ARRAY_COLOR:
1881                 stride = dpsoftrast.stride_color;
1882                 if (dpsoftrast.pointer_color4f)
1883                 {
1884                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1885                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1886                 }
1887                 else if (dpsoftrast.pointer_color4ub)
1888                 {
1889                         stride = dpsoftrast.stride_color;
1890                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1891                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1892                 }
1893                 else
1894                 {
1895                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1896                 }
1897                 break;
1898         default:
1899                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1900                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1901                 {
1902                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1903                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1904                         {
1905                         case 2:
1906                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1907                                 break;
1908                         case 3:
1909                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1910                                 break;
1911                         case 4:
1912                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1913                                 break;
1914                         }
1915                 }
1916                 break;
1917         }
1918         return outf;
1919 #else
1920         return NULL;
1921 #endif
1922 }
1923
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1925 {
1926         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1928         return data;
1929 }
1930
1931 #if 0
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1933 {
1934 #ifdef SSE2_PRESENT
1935         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1936         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
1937         return data;
1938 #else
1939         return NULL;
1940 #endif
1941 }
1942 #endif
1943
1944 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1945 {
1946 #ifdef SSE2_PRESENT
1947         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1948         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1949         return data;
1950 #else
1951         return NULL;
1952 #endif
1953 }
1954
1955 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1956 {
1957         int x;
1958         int startx = span->startx;
1959         int endx = span->endx;
1960         float wslope = triangle->w[0];
1961         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1962         float endz = 1.0f / (w + wslope * startx);
1963         for (x = startx;x < endx;)
1964         {
1965                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1966                 float z = endz, dz;
1967                 if (nextsub >= endx) nextsub = endsub = endx-1;
1968                 endz = 1.0f / (w + wslope * nextsub);
1969                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1970                 for (; x <= endsub; x++, z += dz)
1971                         zf[x] = z;
1972         }
1973 }
1974
1975 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1976 {
1977         int x;
1978         int startx = span->startx;
1979         int endx = span->endx;
1980         int d[4];
1981         float a, b;
1982         unsigned char * RESTRICT pixelmask = span->pixelmask;
1983         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1984         if (!pixel)
1985                 return;
1986         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1987         // handle alphatest now (this affects depth writes too)
1988         if (thread->alphatest)
1989                 for (x = startx;x < endx;x++)
1990                         if (in4f[x*4+3] < 0.5f)
1991                                 pixelmask[x] = false;
1992         // FIXME: this does not handle bigendian
1993         switch(thread->fb_blendmode)
1994         {
1995         case DPSOFTRAST_BLENDMODE_OPAQUE:
1996                 for (x = startx;x < endx;x++)
1997                 {
1998                         if (!pixelmask[x])
1999                                 continue;
2000                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2001                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2002                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2003                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2004                         pixel[x*4+0] = d[0];
2005                         pixel[x*4+1] = d[1];
2006                         pixel[x*4+2] = d[2];
2007                         pixel[x*4+3] = d[3];
2008                 }
2009                 break;
2010         case DPSOFTRAST_BLENDMODE_ALPHA:
2011                 for (x = startx;x < endx;x++)
2012                 {
2013                         if (!pixelmask[x])
2014                                 continue;
2015                         a = in4f[x*4+3] * 255.0f;
2016                         b = 1.0f - in4f[x*4+3];
2017                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2018                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2019                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2020                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2021                         pixel[x*4+0] = d[0];
2022                         pixel[x*4+1] = d[1];
2023                         pixel[x*4+2] = d[2];
2024                         pixel[x*4+3] = d[3];
2025                 }
2026                 break;
2027         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2028                 for (x = startx;x < endx;x++)
2029                 {
2030                         if (!pixelmask[x])
2031                                 continue;
2032                         a = in4f[x*4+3] * 255.0f;
2033                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2034                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2035                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2036                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2037                         pixel[x*4+0] = d[0];
2038                         pixel[x*4+1] = d[1];
2039                         pixel[x*4+2] = d[2];
2040                         pixel[x*4+3] = d[3];
2041                 }
2042                 break;
2043         case DPSOFTRAST_BLENDMODE_ADD:
2044                 for (x = startx;x < endx;x++)
2045                 {
2046                         if (!pixelmask[x])
2047                                 continue;
2048                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2049                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2050                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2051                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2052                         pixel[x*4+0] = d[0];
2053                         pixel[x*4+1] = d[1];
2054                         pixel[x*4+2] = d[2];
2055                         pixel[x*4+3] = d[3];
2056                 }
2057                 break;
2058         case DPSOFTRAST_BLENDMODE_INVMOD:
2059                 for (x = startx;x < endx;x++)
2060                 {
2061                         if (!pixelmask[x])
2062                                 continue;
2063                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2064                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2065                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2066                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2067                         pixel[x*4+0] = d[0];
2068                         pixel[x*4+1] = d[1];
2069                         pixel[x*4+2] = d[2];
2070                         pixel[x*4+3] = d[3];
2071                 }
2072                 break;
2073         case DPSOFTRAST_BLENDMODE_MUL:
2074                 for (x = startx;x < endx;x++)
2075                 {
2076                         if (!pixelmask[x])
2077                                 continue;
2078                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2079                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2080                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2081                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2082                         pixel[x*4+0] = d[0];
2083                         pixel[x*4+1] = d[1];
2084                         pixel[x*4+2] = d[2];
2085                         pixel[x*4+3] = d[3];
2086                 }
2087                 break;
2088         case DPSOFTRAST_BLENDMODE_MUL2:
2089                 for (x = startx;x < endx;x++)
2090                 {
2091                         if (!pixelmask[x])
2092                                 continue;
2093                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2094                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2095                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2096                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2097                         pixel[x*4+0] = d[0];
2098                         pixel[x*4+1] = d[1];
2099                         pixel[x*4+2] = d[2];
2100                         pixel[x*4+3] = d[3];
2101                 }
2102                 break;
2103         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2104                 for (x = startx;x < endx;x++)
2105                 {
2106                         if (!pixelmask[x])
2107                                 continue;
2108                         a = in4f[x*4+3] * -255.0f;
2109                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2110                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2111                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2112                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2113                         pixel[x*4+0] = d[0];
2114                         pixel[x*4+1] = d[1];
2115                         pixel[x*4+2] = d[2];
2116                         pixel[x*4+3] = d[3];
2117                 }
2118                 break;
2119         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2120                 for (x = startx;x < endx;x++)
2121                 {
2122                         if (!pixelmask[x])
2123                                 continue;
2124                         a = 255.0f;
2125                         b = 1.0f - in4f[x*4+3];
2126                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2127                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2128                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2129                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2130                         pixel[x*4+0] = d[0];
2131                         pixel[x*4+1] = d[1];
2132                         pixel[x*4+2] = d[2];
2133                         pixel[x*4+3] = d[3];
2134                 }
2135                 break;
2136         case DPSOFTRAST_BLENDMODE_INVADD:
2137                 for (x = startx;x < endx;x++)
2138                 {
2139                         if (!pixelmask[x])
2140                                 continue;
2141                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2142                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2144                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145                         pixel[x*4+0] = d[0];
2146                         pixel[x*4+1] = d[1];
2147                         pixel[x*4+2] = d[2];
2148                         pixel[x*4+3] = d[3];
2149                 }
2150                 break;
2151         }
2152 }
2153
2154 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2155 {
2156 #ifdef SSE2_PRESENT
2157         int x;
2158         int startx = span->startx;
2159         int endx = span->endx;
2160         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2161         unsigned char * RESTRICT pixelmask = span->pixelmask;
2162         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2163         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2164         if (!pixel)
2165                 return;
2166         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2167         pixeli += span->y * dpsoftrast.fb_width + span->x;
2168         // handle alphatest now (this affects depth writes too)
2169         if (thread->alphatest)
2170                 for (x = startx;x < endx;x++)
2171                         if (in4ub[x*4+3] < 0.5f)
2172                                 pixelmask[x] = false;
2173         // FIXME: this does not handle bigendian
2174         switch(thread->fb_blendmode)
2175         {
2176         case DPSOFTRAST_BLENDMODE_OPAQUE:
2177                 for (x = startx;x + 4 <= endx;)
2178                 {
2179                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2180                         {
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182                                 x += 4;
2183                         }
2184                         else
2185                         {
2186                                 if (pixelmask[x])
2187                                         pixeli[x] = ini[x];
2188                                 x++;
2189                         }
2190                 }
2191                 for (;x < endx;x++)
2192                         if (pixelmask[x])
2193                                 pixeli[x] = ini[x];
2194                 break;
2195         case DPSOFTRAST_BLENDMODE_ALPHA:
2196         #define FINISHBLEND(blend2, blend1) \
2197                 for (x = startx;x + 1 < endx;x += 2) \
2198                 { \
2199                         __m128i src, dst; \
2200                         switch (*(const unsigned short*)&pixelmask[x]) \
2201                         { \
2202                         case 0x0101: \
2203                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2204                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2205                                 blend2; \
2206                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2207                                 continue; \
2208                         case 0x0100: \
2209                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2210                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2211                                 blend1; \
2212                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2213                                 continue; \
2214                         case 0x0001: \
2215                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2216                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2217                                 blend1; \
2218                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2219                                 continue; \
2220                         } \
2221                         break; \
2222                 } \
2223                 for(;x < endx; x++) \
2224                 { \
2225                         __m128i src, dst; \
2226                         if (!pixelmask[x]) \
2227                                 continue; \
2228                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2230                         blend1; \
2231                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2232                 }
2233
2234                 FINISHBLEND({
2235                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2236                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2237                 }, {
2238                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2239                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                 });
2241                 break;
2242         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243                 FINISHBLEND({
2244                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2246                 }, {
2247                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_ADD:
2252                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_INVMOD:
2255                 FINISHBLEND({
2256                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257                 }, {
2258                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2259                 });
2260                 break;
2261         case DPSOFTRAST_BLENDMODE_MUL:
2262                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2263                 break;
2264         case DPSOFTRAST_BLENDMODE_MUL2:
2265                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2266                 break;
2267         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2268                 FINISHBLEND({
2269                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271                 }, {
2272                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                 });
2275                 break;
2276         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2277                 FINISHBLEND({
2278                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2280                 }, {
2281                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                 });
2284                 break;
2285         case DPSOFTRAST_BLENDMODE_INVADD:
2286                 FINISHBLEND({
2287                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288                 }, {
2289                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2290                 });
2291                 break;
2292         }
2293 #endif
2294 }
2295
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2297 {
2298         int x;
2299         int startx = span->startx;
2300         int endx = span->endx;
2301         int flags;
2302         float c[4];
2303         float data[4];
2304         float slope[4];
2305         float tc[2], endtc[2];
2306         float tcscale[2];
2307         unsigned int tci[2];
2308         unsigned int tci1[2];
2309         unsigned int tcimin[2];
2310         unsigned int tcimax[2];
2311         int tciwrapmask[2];
2312         int tciwidth;
2313         int filter;
2314         int mip;
2315         const unsigned char * RESTRICT pixelbase;
2316         const unsigned char * RESTRICT pixel[4];
2317         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318         // if no texture is bound, just fill it with white
2319         if (!texture)
2320         {
2321                 for (x = startx;x < endx;x++)
2322                 {
2323                         out4f[x*4+0] = 1.0f;
2324                         out4f[x*4+1] = 1.0f;
2325                         out4f[x*4+2] = 1.0f;
2326                         out4f[x*4+3] = 1.0f;
2327                 }
2328                 return;
2329         }
2330         mip = triangle->mip[texunitindex];
2331         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332         // if this mipmap of the texture is 1 pixel, just fill it with that color
2333         if (texture->mipmap[mip][1] == 4)
2334         {
2335                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339                 for (x = startx;x < endx;x++)
2340                 {
2341                         out4f[x*4+0] = c[0];
2342                         out4f[x*4+1] = c[1];
2343                         out4f[x*4+2] = c[2];
2344                         out4f[x*4+3] = c[3];
2345                 }
2346                 return;
2347         }
2348         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350         flags = texture->flags;
2351         tcscale[0] = texture->mipmap[mip][2];
2352         tcscale[1] = texture->mipmap[mip][3];
2353         tciwidth = texture->mipmap[mip][2];
2354         tcimin[0] = 0;
2355         tcimin[1] = 0;
2356         tcimax[0] = texture->mipmap[mip][2]-1;
2357         tcimax[1] = texture->mipmap[mip][3]-1;
2358         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2360         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2361         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2362         for (x = startx;x < endx;)
2363         {
2364                 unsigned int subtc[2];
2365                 unsigned int substep[2];
2366                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2367                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2368                 if (nextsub >= endx)
2369                 {
2370                         nextsub = endsub = endx-1;      
2371                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2372                 }
2373                 tc[0] = endtc[0];
2374                 tc[1] = endtc[1];
2375                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2376                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2377                 substep[0] = (endtc[0] - tc[0]) * subscale;
2378                 substep[1] = (endtc[1] - tc[1]) * subscale;
2379                 subtc[0] = tc[0] * (1<<16);
2380                 subtc[1] = tc[1] * (1<<16);
2381                 if (filter)
2382                 {
2383                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2384                         {
2385                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2386                                 {
2387                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2388                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2389                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2390                                         tci[0] = subtc[0]>>16;
2391                                         tci[1] = subtc[1]>>16;
2392                                         tci1[0] = tci[0] + 1;
2393                                         tci1[1] = tci[1] + 1;
2394                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2395                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2396                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2397                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2398                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2399                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2400                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2401                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2402                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2403                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2404                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2405                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2406                                         out4f[x*4+0] = c[0];
2407                                         out4f[x*4+1] = c[1];
2408                                         out4f[x*4+2] = c[2];
2409                                         out4f[x*4+3] = c[3];
2410                                 }
2411                         }
2412                         else
2413                         {
2414                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415                                 {
2416                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419                                         tci[0] = subtc[0]>>16;
2420                                         tci[1] = subtc[1]>>16;
2421                                         tci1[0] = tci[0] + 1;
2422                                         tci1[1] = tci[1] + 1;
2423                                         tci[0] &= tciwrapmask[0];
2424                                         tci[1] &= tciwrapmask[1];
2425                                         tci1[0] &= tciwrapmask[0];
2426                                         tci1[1] &= tciwrapmask[1];
2427                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435                                         out4f[x*4+0] = c[0];
2436                                         out4f[x*4+1] = c[1];
2437                                         out4f[x*4+2] = c[2];
2438                                         out4f[x*4+3] = c[3];
2439                                 }
2440                         }
2441                 }
2442                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2443                 {
2444                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2445                         {
2446                                 tci[0] = subtc[0]>>16;
2447                                 tci[1] = subtc[1]>>16;
2448                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2449                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2450                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2451                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2452                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2453                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2454                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2455                                 out4f[x*4+0] = c[0];
2456                                 out4f[x*4+1] = c[1];
2457                                 out4f[x*4+2] = c[2];
2458                                 out4f[x*4+3] = c[3];
2459                         }
2460                 }
2461                 else
2462                 {
2463                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2464                         {
2465                                 tci[0] = subtc[0]>>16;
2466                                 tci[1] = subtc[1]>>16;
2467                                 tci[0] &= tciwrapmask[0];
2468                                 tci[1] &= tciwrapmask[1];
2469                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2470                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2471                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2472                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2473                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2474                                 out4f[x*4+0] = c[0];
2475                                 out4f[x*4+1] = c[1];
2476                                 out4f[x*4+2] = c[2];
2477                                 out4f[x*4+3] = c[3];
2478                         }
2479                 }
2480         }
2481 }
2482
2483 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2484 {
2485 #ifdef SSE2_PRESENT
2486         int x;
2487         int startx = span->startx;
2488         int endx = span->endx;
2489         int flags;
2490         __m128 data, slope, tcscale;
2491         __m128i tcsize, tcmask, tcoffset, tcmax;
2492         __m128 tc, endtc;
2493         __m128i subtc, substep, endsubtc;
2494         int filter;
2495         int mip;
2496         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2497         const unsigned char * RESTRICT pixelbase;
2498         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2499         // if no texture is bound, just fill it with white
2500         if (!texture)
2501         {
2502                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2503                 return;
2504         }
2505         mip = triangle->mip[texunitindex];
2506         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2507         // if this mipmap of the texture is 1 pixel, just fill it with that color
2508         if (texture->mipmap[mip][1] == 4)
2509         {
2510                 unsigned int k = *((const unsigned int *)pixelbase);
2511                 for (x = startx;x < endx;x++)
2512                         outi[x] = k;
2513                 return;
2514         }
2515         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2517         flags = texture->flags;
2518         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2519         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2520         tcscale = _mm_cvtepi32_ps(tcsize);
2521         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2522         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2523         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2524         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2525         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2526         tcmax = _mm_packs_epi32(tcmask, tcmask);
2527         for (x = startx;x < endx;)
2528         {
2529                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2530                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2531                 if (nextsub >= endx)
2532                 {
2533                         nextsub = endsub = endx-1;
2534                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2535                 }       
2536                 tc = endtc;
2537                 subtc = endsubtc;
2538                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2539                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2540                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2541                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2542                 substep = _mm_slli_epi32(substep, 1);
2543                 if (filter)
2544                 {
2545                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2546                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2547                         {
2548                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2549                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2550                                 {
2551                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2552                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2553                                         tci = _mm_madd_epi16(tci, tcoffset);
2554                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2555                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2556                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2557                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2558                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2559                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2560                                         fracm = _mm_srli_epi16(subtc, 1);
2561                                         pix1 = _mm_add_epi16(pix1,
2562                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2563                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2564                                         pix3 = _mm_add_epi16(pix3,
2565                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2566                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2567                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2568                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2569                                         pix2 = _mm_add_epi16(pix2,
2570                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2571                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2572                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2573                                 }
2574                                 if (x <= endsub)
2575                                 {
2576                                         const unsigned char * RESTRICT ptr1;
2577                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2578                                         tci = _mm_madd_epi16(tci, tcoffset);
2579                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2581                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2582                                         fracm = _mm_srli_epi16(subtc, 1);
2583                                         pix1 = _mm_add_epi16(pix1,
2584                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2585                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2586                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2587                                         pix1 = _mm_add_epi16(pix1,
2588                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2590                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2591                                         x++;
2592                                 }
2593                         }
2594                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2595                         {
2596                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2597                                 {
2598                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2599                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2600                                         tci = _mm_madd_epi16(tci, tcoffset);
2601                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603                                                                                         _mm_setzero_si128());
2604                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606                                                                                         _mm_setzero_si128());
2607                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2608                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2609                                         tci = _mm_madd_epi16(tci, tcoffset);
2610                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612                                                                                         _mm_setzero_si128());
2613                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615                                                                                         _mm_setzero_si128());
2616                                         fracm = _mm_srli_epi16(subtc, 1);
2617                                         pix1 = _mm_add_epi16(pix1,
2618                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2619                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2620                                         pix3 = _mm_add_epi16(pix3,
2621                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2622                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2623                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2624                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2625                                         pix2 = _mm_add_epi16(pix2,
2626                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2627                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2628                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2629                                 }
2630                                 if (x <= endsub)
2631                                 {
2632                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2633                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2634                                         tci = _mm_madd_epi16(tci, tcoffset);
2635                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2636                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2637                                                                                         _mm_setzero_si128());
2638                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2639                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2640                                                                                         _mm_setzero_si128());
2641                                         fracm = _mm_srli_epi16(subtc, 1);
2642                                         pix1 = _mm_add_epi16(pix1,
2643                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2646                                         pix1 = _mm_add_epi16(pix1,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2649                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2650                                         x++;
2651                                 }
2652                         }
2653                         else
2654                         {
2655                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2656                                 {
2657                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2658                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662                                                                                         _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665                                                                                         _mm_setzero_si128());
2666                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2667                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668                                         tci = _mm_madd_epi16(tci, tcoffset);
2669                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671                                                                                         _mm_setzero_si128());
2672                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674                                                                                         _mm_setzero_si128());
2675                                         fracm = _mm_srli_epi16(subtc, 1);
2676                                         pix1 = _mm_add_epi16(pix1,
2677                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2679                                         pix3 = _mm_add_epi16(pix3,
2680                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2681                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2682                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2683                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2684                                         pix2 = _mm_add_epi16(pix2,
2685                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2686                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2687                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2688                                 }
2689                                 if (x <= endsub)
2690                                 {
2691                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2692                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693                                         tci = _mm_madd_epi16(tci, tcoffset);
2694                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2695                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696                                                                                         _mm_setzero_si128());
2697                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699                                                                                         _mm_setzero_si128());
2700                                         fracm = _mm_srli_epi16(subtc, 1);
2701                                         pix1 = _mm_add_epi16(pix1,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2705                                         pix1 = _mm_add_epi16(pix1,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2708                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2709                                         x++;
2710                                 }
2711                         }
2712                 }
2713                 else
2714                 {
2715                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2716                         {
2717                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2718                                 {
2719                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2720                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2721                                         tci = _mm_madd_epi16(tci, tcoffset);
2722                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2724                                 }
2725                                 if (x <= endsub)
2726                                 {
2727                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2728                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2729                                         tci = _mm_madd_epi16(tci, tcoffset);
2730                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2731                                         x++;
2732                                 }
2733                         }
2734                         else
2735                         {
2736                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737                                 {
2738                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2739                                         tci = _mm_and_si128(tci, tcmax); 
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2743                                 }
2744                                 if (x <= endsub)
2745                                 {
2746                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2747                                         tci = _mm_and_si128(tci, tcmax); 
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2750                                         x++;
2751                                 }
2752                         }
2753                 }
2754         }
2755 #endif
2756 }
2757
2758 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2759 {
2760         // TODO: IMPLEMENT
2761         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2762 }
2763
2764 float DPSOFTRAST_SampleShadowmap(const float *vector)
2765 {
2766         // TODO: IMPLEMENT
2767         return 1.0f;
2768 }
2769
2770 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2771 {
2772         int x;
2773         int startx = span->startx;
2774         int endx = span->endx;
2775         float c[4];
2776         float data[4];
2777         float slope[4];
2778         float z;
2779         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2780         for (x = startx;x < endx;x++)
2781         {
2782                 z = zf[x];
2783                 c[0] = (data[0] + slope[0]*x) * z;
2784                 c[1] = (data[1] + slope[1]*x) * z;
2785                 c[2] = (data[2] + slope[2]*x) * z;
2786                 c[3] = (data[3] + slope[3]*x) * z;
2787                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2788                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2789                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2790                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2791         }
2792 }
2793
2794 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2795 {
2796         int x;
2797         int startx = span->startx;
2798         int endx = span->endx;
2799         float c[4];
2800         float data[4];
2801         float slope[4];
2802         float z;
2803         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2804         for (x = startx;x < endx;x++)
2805         {
2806                 z = zf[x];
2807                 c[0] = (data[0] + slope[0]*x) * z;
2808                 c[1] = (data[1] + slope[1]*x) * z;
2809                 c[2] = (data[2] + slope[2]*x) * z;
2810                 c[3] = (data[3] + slope[3]*x) * z;
2811                 out4f[x*4+0] = c[0];
2812                 out4f[x*4+1] = c[1];
2813                 out4f[x*4+2] = c[2];
2814                 out4f[x*4+3] = c[3];
2815         }
2816 }
2817
2818 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2819 {
2820         int x, startx = span->startx, endx = span->endx;
2821         float c[4], localcolor[4];
2822         localcolor[0] = subcolor[0];
2823         localcolor[1] = subcolor[1];
2824         localcolor[2] = subcolor[2];
2825         localcolor[3] = subcolor[3];
2826         for (x = startx;x < endx;x++)
2827         {
2828                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2829                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2830                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2831                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2832                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2833                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2834                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2835                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2836         }
2837 }
2838
2839 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2840 {
2841         int x, startx = span->startx, endx = span->endx;
2842         for (x = startx;x < endx;x++)
2843         {
2844                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2845                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2846                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2847                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2848         }
2849 }
2850
2851 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2852 {
2853         int x, startx = span->startx, endx = span->endx;
2854         for (x = startx;x < endx;x++)
2855         {
2856                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2857                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2858                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2859                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2860         }
2861 }
2862
2863 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2864 {
2865         int x, startx = span->startx, endx = span->endx;
2866         float a, b;
2867         for (x = startx;x < endx;x++)
2868         {
2869                 a = 1.0f - inb4f[x*4+3];
2870                 b = inb4f[x*4+3];
2871                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2872                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2873                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2874                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2875         }
2876 }
2877
2878 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2879 {
2880         int x, startx = span->startx, endx = span->endx;
2881         float localcolor[4], ilerp, lerp;
2882         localcolor[0] = color[0];
2883         localcolor[1] = color[1];
2884         localcolor[2] = color[2];
2885         localcolor[3] = color[3];
2886         ilerp = 1.0f - localcolor[3];
2887         lerp = localcolor[3];
2888         for (x = startx;x < endx;x++)
2889         {
2890                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2891                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2892                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2893                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2894         }
2895 }
2896
2897
2898
2899 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2900 {
2901 #ifdef SSE2_PRESENT
2902         int x;
2903         int startx = span->startx;
2904         int endx = span->endx;
2905         __m128 data, slope;
2906         __m128 mod, endmod;
2907         __m128i submod, substep, endsubmod;
2908         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2909         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2911         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2913         for (x = startx; x < endx;)
2914         {
2915                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2917                 if (nextsub >= endx)
2918                 {
2919                         nextsub = endsub = endx-1;
2920                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2921                 }
2922                 mod = endmod;
2923                 submod = endsubmod;
2924                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2927                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928                 substep = _mm_packs_epi32(substep, substep);
2929                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2930                 {
2931                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2932                         pix = _mm_mulhi_epu16(pix, submod);
2933                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2934                 }
2935                 if (x <= endsub)
2936                 {
2937                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2938                         pix = _mm_mulhi_epu16(pix, submod);
2939                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2940                         x++;
2941                 }
2942         }
2943 #endif
2944 }
2945
2946 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2947 {
2948 #ifdef SSE2_PRESENT
2949         int x;
2950         int startx = span->startx;
2951         int endx = span->endx;
2952         __m128 data, slope;
2953         __m128 mod, endmod;
2954         __m128i submod, substep, endsubmod;
2955         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2956         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2957         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2958         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2959         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2960         for (x = startx; x < endx;)
2961         {
2962                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2963                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2964                 if (nextsub >= endx)
2965                 {
2966                         nextsub = endsub = endx-1;
2967                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2968                 }
2969                 mod = endmod;
2970                 submod = endsubmod;
2971                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2972                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2973                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2974                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2975                 substep = _mm_packs_epi32(substep, substep);
2976                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2977                 {
2978                         __m128i pix = _mm_srai_epi16(submod, 4);
2979                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2980                 }
2981                 if (x <= endsub)
2982                 {
2983                         __m128i pix = _mm_srai_epi16(submod, 4);
2984                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2985                         x++;
2986                 }
2987         }
2988 #endif
2989 }
2990
2991 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2992 {
2993 #ifdef SSE2_PRESENT
2994         int x, startx = span->startx, endx = span->endx;
2995         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2996         localcolor = _mm_packs_epi32(localcolor, localcolor);
2997         for (x = startx;x+2 <= endx;x+=2)
2998         {
2999                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3000                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3001                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3002                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3003         }
3004         if (x < endx)
3005         {
3006                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3007                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3008                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3009                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3010         }
3011 #endif
3012 }
3013
3014 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3015 {
3016 #ifdef SSE2_PRESENT
3017         int x, startx = span->startx, endx = span->endx;
3018         for (x = startx;x+2 <= endx;x+=2)
3019         {
3020                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3024         }
3025         if (x < endx)
3026         {
3027                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3030                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3031         }
3032 #endif
3033 }
3034
3035 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3036 {
3037 #ifdef SSE2_PRESENT
3038         int x, startx = span->startx, endx = span->endx;
3039         for (x = startx;x+2 <= endx;x+=2)
3040         {
3041                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043                 pix1 = _mm_add_epi16(pix1, pix2);
3044                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3045         }
3046         if (x < endx)
3047         {
3048                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050                 pix1 = _mm_add_epi16(pix1, pix2);
3051                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3052         }
3053 #endif
3054 }
3055
3056 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3057 {
3058 #ifdef SSE2_PRESENT
3059         int x, startx = span->startx, endx = span->endx;
3060         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3061         tint = _mm_packs_epi32(tint, tint);
3062         for (x = startx;x+2 <= endx;x+=2)
3063         {
3064                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3065                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3066                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3068         }
3069         if (x < endx)
3070         {
3071                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3072                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3073                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3074                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3075         }
3076 #endif
3077 }
3078
3079 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3080 {
3081 #ifdef SSE2_PRESENT
3082         int x, startx = span->startx, endx = span->endx;
3083         for (x = startx;x+2 <= endx;x+=2)
3084         {
3085                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3087                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3088                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3090         }
3091         if (x < endx)
3092         {
3093                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3094                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3095                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3096                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3097                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3098         }
3099 #endif
3100 }
3101
3102 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3103 {
3104 #ifdef SSE2_PRESENT
3105         int x, startx = span->startx, endx = span->endx;
3106         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3107         localcolor = _mm_packs_epi32(localcolor, localcolor);
3108         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3109         for (x = startx;x+2 <= endx;x+=2)
3110         {
3111                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3112                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3113                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3114         }
3115         if (x < endx)
3116         {
3117                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3118                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3119                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3120         }
3121 #endif
3122 }
3123
3124
3125
3126 void DPSOFTRAST_VertexShader_Generic(void)
3127 {
3128         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3130         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3131         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3132                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3133 }
3134
3135 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3136 {
3137         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3138         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3143         {
3144                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3145                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3146                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3147                 {
3148                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3149                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3150                         {
3151                                 // multiply
3152                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3153                         }
3154                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3155                         {
3156                                 // add
3157                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3158                         }
3159                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3160                         {
3161                                 // alphablend
3162                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3163                         }
3164                 }
3165         }
3166         else
3167                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3168         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3169 }
3170
3171
3172
3173 void DPSOFTRAST_VertexShader_PostProcess(void)
3174 {
3175         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3176         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3177         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3178 }
3179
3180 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3181 {
3182         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3183         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3184         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3186         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3187         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3188         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3189         {
3190                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3191                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3192         }
3193         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3194         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3195         {
3196                 // TODO: implement saturation
3197         }
3198         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3199         {
3200                 // TODO: implement gammaramps
3201         }
3202         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3203 }
3204
3205
3206
3207 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3208 {
3209         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3210 }
3211
3212 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3213 {
3214         // this is never called (because colormask is off when this shader is used)
3215         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3216         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3217         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3218         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3219         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3220 }
3221
3222
3223
3224 void DPSOFTRAST_VertexShader_FlatColor(void)
3225 {
3226         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3228 }
3229
3230 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3231 {
3232 #ifdef SSE2_PRESENT
3233         unsigned char * RESTRICT pixelmask = span->pixelmask;
3234         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3235         int x, startx = span->startx, endx = span->endx;
3236         __m128i Color_Ambientm;
3237         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3238         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3240         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3241         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3242         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3243                 pixel = buffer_FragColorbgra8;
3244         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3245         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3246         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3247         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3248         for (x = startx;x < endx;x++)
3249         {
3250                 __m128i color, pix;
3251                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3252                 {
3253                         __m128i pix2;
3254                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3255                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3256                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3257                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3258                         x += 3;
3259                         continue;
3260                 }
3261                 if (!pixelmask[x])
3262                         continue;
3263                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3264                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3265                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3266         }
3267         if (pixel == buffer_FragColorbgra8)
3268                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3269 #endif
3270 }
3271
3272
3273
3274 void DPSOFTRAST_VertexShader_VertexColor(void)
3275 {
3276         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3278         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3279 }
3280
3281 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3282 {
3283 #ifdef SSE2_PRESENT
3284         unsigned char * RESTRICT pixelmask = span->pixelmask;
3285         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3286         int x, startx = span->startx, endx = span->endx;
3287         __m128i Color_Ambientm, Color_Diffusem;
3288         __m128 data, slope;
3289         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3290         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3292         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3293         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3294         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3295         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3296                 pixel = buffer_FragColorbgra8;
3297         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3298         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3299         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3300         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3301         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3302         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3303         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3304         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3305         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3306         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3307         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3308         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3309         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3310         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3311         {
3312                 __m128i color, mod, pix;
3313                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3314                 {
3315                         __m128i pix2, mod2;
3316                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3317                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3318                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3319                         data = _mm_add_ps(data, slope);
3320                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3321                         data = _mm_add_ps(data, slope);
3322                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3323                         data = _mm_add_ps(data, slope);
3324                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3325                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3326                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3327                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3328                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3329                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3330                         x += 3;
3331                         continue;
3332                 }
3333                 if (!pixelmask[x])
3334                         continue;
3335                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3336                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3337                 mod = _mm_packs_epi32(mod, mod);
3338                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3339                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3340         }
3341         if (pixel == buffer_FragColorbgra8)
3342                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3343 #endif
3344 }
3345
3346
3347
3348 void DPSOFTRAST_VertexShader_Lightmap(void)
3349 {
3350         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3351         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3352         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3353 }
3354
3355 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3356 {
3357 #ifdef SSE2_PRESENT
3358         unsigned char * RESTRICT pixelmask = span->pixelmask;
3359         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3360         int x, startx = span->startx, endx = span->endx;
3361         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3362         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3363         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3368         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3369         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3370         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3371                 pixel = buffer_FragColorbgra8;
3372         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3375         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3376         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3377         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3378         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3379         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3380         {
3381                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3382                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3383                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3384                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3385                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3386                 for (x = startx;x < endx;x++)
3387                 {
3388                         __m128i color, lightmap, glow, pix;
3389                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3390                         {
3391                                 __m128i pix2;
3392                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3393                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3394                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3395                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3396                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3397                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3398                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3399                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3400                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3401                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3402                                 x += 3;
3403                                 continue;
3404                         }
3405                         if (!pixelmask[x])
3406                                 continue;
3407                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3410                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3411                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3412                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3413                 }
3414         }
3415         else
3416         {
3417                 for (x = startx;x < endx;x++)
3418                 {
3419                         __m128i color, lightmap, pix;
3420                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3421                         {
3422                                 __m128i pix2;
3423                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3424                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3425                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3426                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3427                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3429                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3430                                 x += 3;
3431                                 continue;
3432                         }
3433                         if (!pixelmask[x]) 
3434                                 continue;
3435                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3436                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3437                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3438                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3439                 }
3440         }
3441         if (pixel == buffer_FragColorbgra8)
3442                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3443 #endif
3444 }
3445
3446
3447
3448 void DPSOFTRAST_VertexShader_FakeLight(void)
3449 {
3450         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3451 }
3452
3453 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3454 {
3455         // TODO: IMPLEMENT
3456         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3457         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3459         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3460         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3461 }
3462
3463
3464
3465 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3466 {
3467         DPSOFTRAST_VertexShader_Lightmap();
3468 }
3469
3470 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3471 {
3472         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3473         // TODO: IMPLEMENT
3474 }
3475
3476
3477
3478 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3479 {
3480         DPSOFTRAST_VertexShader_Lightmap();
3481 }
3482
3483 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3484 {
3485         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3486         // TODO: IMPLEMENT
3487 }
3488
3489
3490
3491 void DPSOFTRAST_VertexShader_LightDirection(void)
3492 {
3493         int i;
3494         int numvertices = dpsoftrast.numvertices;
3495         float LightDir[4];
3496         float LightVector[4];
3497         float EyePosition[4];
3498         float EyeVectorModelSpace[4];
3499         float EyeVector[4];
3500         float position[4];
3501         float svector[4];
3502         float tvector[4];
3503         float normal[4];
3504         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3505         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3506         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3507         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3508         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3509         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3510         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3511         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3512         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3513         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3514         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3515         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3516         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3517         for (i = 0;i < numvertices;i++)
3518         {
3519                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3520                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3521                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3522                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3523                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3524                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3525                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3526                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3527                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3528                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3529                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3530                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3531                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3532                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3533                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3534                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3535                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3538                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3539                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3540                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3541                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3542                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3543                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3545                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3547                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3548         }
3549         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3550 }
3551
3552 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3553 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3554 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3555 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3556 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3557 #define DPSOFTRAST_Vector3Normalize(v)\
3558 do\
3559 {\
3560         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3561         if (len)\
3562         {\
3563                 len = 1.0f / len;\
3564                 v[0] *= len;\
3565                 v[1] *= len;\
3566                 v[2] *= len;\
3567         }\
3568 }\
3569 while(0)
3570
3571 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3572 {
3573         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3574         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581         int x, startx = span->startx, endx = span->endx;
3582         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3583         float LightVectordata[4];
3584         float LightVectorslope[4];
3585         float EyeVectordata[4];
3586         float EyeVectorslope[4];
3587         float z;
3588         float diffusetex[4];
3589         float glosstex[4];
3590         float surfacenormal[4];
3591         float lightnormal[4];
3592         float eyenormal[4];
3593         float specularnormal[4];
3594         float diffuse;
3595         float specular;
3596         float SpecularPower;
3597         int d[4];
3598         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3599         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3600         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3601         Color_Glow[3] = 0.0f;
3602         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3603         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3604         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3605         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3606         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3607         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3608         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3609         Color_Pants[3] = 0.0f;
3610         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3611         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3612         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3613         Color_Shirt[3] = 0.0f;
3614         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3615         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3616         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3617         {
3618                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3619                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3620         }
3621         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3622         {
3623                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3624         }
3625         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3626         {
3627                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3628                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3629                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3630                 Color_Diffuse[3] = 0.0f;
3631                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3632                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3633                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3634                 LightColor[3] = 0.0f;
3635                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3636                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3637                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3638                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3639                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3640                 Color_Specular[3] = 0.0f;
3641                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3642                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3643                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3644                 for (x = startx;x < endx;x++)
3645                 {
3646                         z = buffer_z[x];
3647                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3648                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3649                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3650                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3651                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3652                         {
3653                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3654                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3655                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3656                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3657                         }
3658                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3659                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3660                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3661                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3662                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3663                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3664                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3665                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3666
3667                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3668                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3669                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3670                         DPSOFTRAST_Vector3Normalize(lightnormal);
3671
3672                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3673                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3674                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3675                         DPSOFTRAST_Vector3Normalize(eyenormal);
3676
3677                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3678                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3679                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3680                         DPSOFTRAST_Vector3Normalize(specularnormal);
3681
3682                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3683                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3684                         specular = pow(specular, SpecularPower * glosstex[3]);
3685                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3686                         {
3687                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3688                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3689                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3690                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3691                         }
3692                         else
3693                         {
3694                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3695                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3696                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3697                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3698                         }
3699                         buffer_FragColorbgra8[x*4+0] = d[0];
3700                         buffer_FragColorbgra8[x*4+1] = d[1];
3701                         buffer_FragColorbgra8[x*4+2] = d[2];
3702                         buffer_FragColorbgra8[x*4+3] = d[3];
3703                 }
3704         }
3705         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3706         {
3707                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3708                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3709                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3710                 Color_Diffuse[3] = 0.0f;
3711                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3712                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3713                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3714                 LightColor[3] = 0.0f;
3715                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3716                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3717                 for (x = startx;x < endx;x++)
3718                 {
3719                         z = buffer_z[x];
3720                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3721                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3722                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3723                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3724                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3725                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3726                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3727                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3728
3729                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3730                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3731                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3732                         DPSOFTRAST_Vector3Normalize(lightnormal);
3733
3734                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3735                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3736                         {
3737                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3738                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3739                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3740                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3741                         }
3742                         else
3743                         {
3744                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3745                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3746                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3747                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3748                         }
3749                         buffer_FragColorbgra8[x*4+0] = d[0];
3750                         buffer_FragColorbgra8[x*4+1] = d[1];
3751                         buffer_FragColorbgra8[x*4+2] = d[2];
3752                         buffer_FragColorbgra8[x*4+3] = d[3];
3753                 }
3754         }
3755         else
3756         {
3757                 for (x = startx;x < endx;x++)
3758                 {
3759                         z = buffer_z[x];
3760                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3761                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3762                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3763                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3764
3765                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3766                         {
3767                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3768                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3769                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3770                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3771                         }
3772                         else
3773                         {
3774                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3775                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3776                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3777                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3778                         }
3779                         buffer_FragColorbgra8[x*4+0] = d[0];
3780                         buffer_FragColorbgra8[x*4+1] = d[1];
3781                         buffer_FragColorbgra8[x*4+2] = d[2];
3782                         buffer_FragColorbgra8[x*4+3] = d[3];
3783                 }
3784         }
3785         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3786 }
3787
3788
3789
3790 void DPSOFTRAST_VertexShader_LightSource(void)
3791 {
3792         int i;
3793         int numvertices = dpsoftrast.numvertices;
3794         float LightPosition[4];
3795         float LightVector[4];
3796         float LightVectorModelSpace[4];
3797         float EyePosition[4];
3798         float EyeVectorModelSpace[4];
3799         float EyeVector[4];
3800         float position[4];
3801         float svector[4];
3802         float tvector[4];
3803         float normal[4];
3804         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3805         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3806         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3807         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3808         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3809         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3810         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3811         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3812         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3813         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3814         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3815         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3816         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3817         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3818         for (i = 0;i < numvertices;i++)
3819         {
3820                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3821                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3822                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3823                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3824                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3825                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3826                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3827                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3828                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3829                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3830                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3831                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3832                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3833                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3834                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3835                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3836                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3837                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3838                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3839                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3840                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3841                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3842                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3843                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3844                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3845                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3846                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3847                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3848                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3849                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3850                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3851                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3852         }
3853         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3854         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3855 }
3856
3857 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3858 {
3859 #ifdef SSE2_PRESENT
3860         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3861         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3862         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3863         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3864         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3865         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3866         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3867         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3868         int x, startx = span->startx, endx = span->endx;
3869         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3870         float CubeVectordata[4];
3871         float CubeVectorslope[4];
3872         float LightVectordata[4];
3873         float LightVectorslope[4];
3874         float EyeVectordata[4];
3875         float EyeVectorslope[4];
3876         float z;
3877         float diffusetex[4];
3878         float glosstex[4];
3879         float surfacenormal[4];
3880         float lightnormal[4];
3881         float eyenormal[4];
3882         float specularnormal[4];
3883         float diffuse;
3884         float specular;
3885         float SpecularPower;
3886         float CubeVector[4];
3887         float attenuation;
3888         int d[4];
3889         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3890         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3891         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3892         Color_Glow[3] = 0.0f;
3893         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3894         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3895         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3896         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3897         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3898         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3899         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3900         Color_Diffuse[3] = 0.0f;
3901         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3902         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3903         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3904         Color_Specular[3] = 0.0f;
3905         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3906         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3907         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3908         Color_Pants[3] = 0.0f;
3909         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3910         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3911         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3912         Color_Shirt[3] = 0.0f;
3913         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3914         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3915         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3916         LightColor[3] = 0.0f;
3917         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3918         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3919         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3920         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3921         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3922         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3923         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3925         {
3926                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3928         }
3929         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3930                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3931         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3932         {
3933                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3934                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3935                 for (x = startx;x < endx;x++)
3936                 {
3937                         z = buffer_z[x];
3938                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3939                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3940                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3941                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3942                         if (attenuation < 0.01f)
3943                                 continue;
3944                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3945                         {
3946                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3947                                 if (attenuation < 0.01f)
3948                                         continue;
3949                         }
3950
3951                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3952                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3953                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3954                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3955                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3956                         {
3957                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3958                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3959                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3960                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3961                         }
3962                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3963                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3964                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3965                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3966                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3967                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3968                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3969                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3970
3971                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3972                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3973                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3974                         DPSOFTRAST_Vector3Normalize(lightnormal);
3975
3976                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3977                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3978                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3979                         DPSOFTRAST_Vector3Normalize(eyenormal);
3980
3981                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3982                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3983                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3984                         DPSOFTRAST_Vector3Normalize(specularnormal);
3985
3986                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3987                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3988                         specular = pow(specular, SpecularPower * glosstex[3]);
3989                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3990                         {
3991                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3992                                 attenuation *= (1.0f / 255.0f);
3993                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3994                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3995                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3996                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3997                         }
3998                         else
3999                         {
4000                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4001                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4002                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4003                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4004                         }
4005                         buffer_FragColorbgra8[x*4+0] = d[0];
4006                         buffer_FragColorbgra8[x*4+1] = d[1];
4007                         buffer_FragColorbgra8[x*4+2] = d[2];
4008                         buffer_FragColorbgra8[x*4+3] = d[3];
4009                 }
4010         }
4011         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4012         {
4013                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4014                 for (x = startx;x < endx;x++)
4015                 {
4016                         z = buffer_z[x];
4017                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4018                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4019                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4020                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4021                         if (attenuation < 0.01f)
4022                                 continue;
4023                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4024                         {
4025                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4026                                 if (attenuation < 0.01f)
4027                                         continue;
4028                         }
4029
4030                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4031                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4032                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4033                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4034                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4035                         {
4036                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4037                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4038                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4039                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4040                         }
4041                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4042                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4043                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4044                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4045
4046                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4047                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4048                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4049                         DPSOFTRAST_Vector3Normalize(lightnormal);
4050
4051                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4052                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4053                         {
4054                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4055                                 attenuation *= (1.0f / 255.0f);
4056                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4057                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4058                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4059                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4060                         }
4061                         else
4062                         {
4063                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4064                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4065                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4066                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4067                         }
4068                         buffer_FragColorbgra8[x*4+0] = d[0];
4069                         buffer_FragColorbgra8[x*4+1] = d[1];
4070                         buffer_FragColorbgra8[x*4+2] = d[2];
4071                         buffer_FragColorbgra8[x*4+3] = d[3];
4072                 }
4073         }
4074         else
4075         {
4076                 for (x = startx;x < endx;x++)
4077                 {
4078                         z = buffer_z[x];
4079                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4080                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4081                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4082                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4083                         if (attenuation < 0.01f)
4084                                 continue;
4085                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4086                         {
4087                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4088                                 if (attenuation < 0.01f)
4089                                         continue;
4090                         }
4091
4092                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4093                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4094                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4095                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4096                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4097                         {
4098                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4099                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4100                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4101                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4102                         }
4103                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4104                         {
4105                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4106                                 attenuation *= (1.0f / 255.0f);
4107                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4108                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4109                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4110                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4111                         }
4112                         else
4113                         {
4114                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4115                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4116                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4117                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4118                         }
4119                         buffer_FragColorbgra8[x*4+0] = d[0];
4120                         buffer_FragColorbgra8[x*4+1] = d[1];
4121                         buffer_FragColorbgra8[x*4+2] = d[2];
4122                         buffer_FragColorbgra8[x*4+3] = d[3];
4123                 }
4124         }
4125         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4126 #endif
4127 }
4128
4129
4130
4131 void DPSOFTRAST_VertexShader_Refraction(void)
4132 {
4133         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4134 }
4135
4136 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4137 {
4138         // TODO: IMPLEMENT
4139         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4140         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4141         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4142         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4143         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4144 }
4145
4146
4147
4148 void DPSOFTRAST_VertexShader_Water(void)
4149 {
4150         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4151 }
4152
4153
4154 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4155 {
4156         // TODO: IMPLEMENT
4157         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4158         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4160         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4161         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4162 }
4163
4164
4165
4166 void DPSOFTRAST_VertexShader_ShowDepth(void)
4167 {
4168         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4169 }
4170
4171 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4172 {
4173         // TODO: IMPLEMENT
4174         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4175         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4176         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4177         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4178         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4179 }
4180
4181
4182
4183 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4184 {
4185         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4186 }
4187
4188 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4189 {
4190         // TODO: IMPLEMENT
4191         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4192         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4193         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4194         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4195         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4196 }
4197
4198
4199
4200 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4201 {
4202         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4203 }
4204
4205 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4206 {
4207         // TODO: IMPLEMENT
4208         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4209         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4210         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4211         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4212         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4213 }
4214
4215
4216
4217 typedef struct DPSOFTRAST_ShaderModeInfo_s
4218 {
4219         int lodarrayindex;
4220         void (*Vertex)(void);
4221         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4222         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4223         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4224 }
4225 DPSOFTRAST_ShaderModeInfo;
4226
4227 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4228 {
4229         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4230         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4231         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4232         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4233         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4234         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4235         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4236         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4237         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4238         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4239         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4240         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4241         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4242         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4243         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4244         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4245 };
4246
4247 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4248 {
4249         int i;
4250         int x;
4251         int startx;
4252         int endx;
4253 //      unsigned int c;
4254 //      unsigned int *colorpixel;
4255         unsigned int *depthpixel;
4256         float w;
4257         float wslope;
4258         int depth;
4259         int depthslope;
4260         unsigned int d;
4261         DPSOFTRAST_State_Triangle *triangle;
4262         DPSOFTRAST_State_Span *span;
4263         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4264         for (i = 0; i < thread->numspans; i++)
4265         {
4266                 span = &thread->spans[i];
4267                 triangle = &thread->triangles[span->triangle];
4268                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4269                 {
4270                         wslope = triangle->w[0];
4271                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4272                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4273                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4274                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4275                         startx = span->startx;
4276                         endx = span->endx;
4277                         switch(thread->fb_depthfunc)
4278                         {
4279                         default:
4280                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4281                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4282                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4283                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4284                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4285                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4286                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4287                         }
4288                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4289                         //for (x = startx;x < endx;x++)
4290                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4291                         // if there is no color buffer, skip pixel shader
4292                         while (startx < endx && !pixelmask[startx])
4293                                 startx++;
4294                         while (endx > startx && !pixelmask[endx-1])
4295                                 endx--;
4296                         if (startx >= endx)
4297                                 continue; // no pixels to fill
4298                         span->pixelmask = pixelmask;
4299                         span->startx = startx;
4300                         span->endx = endx;
4301                         // run pixel shader if appropriate
4302                         // do this before running depthmask code, to allow the pixelshader
4303                         // to clear pixelmask values for alpha testing
4304                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4305                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4306                         if (thread->depthmask)
4307                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4308                                         if (pixelmask[x])
4309                                                 depthpixel[x] = d;
4310                 }
4311                 else
4312                 {
4313                         // no depth testing means we're just dealing with color...
4314                         // if there is no color buffer, skip pixel shader
4315                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4316                         {
4317                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4318                                 span->pixelmask = pixelmask;
4319                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4320                         }
4321                 }
4322         }
4323         thread->numspans = 0;
4324 }
4325
4326 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4327
4328 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4329 {
4330 #ifdef SSE2_PRESENT
4331         int cullface = thread->cullface;
4332         int minx, maxx, miny, maxy;
4333         int miny1, maxy1, miny2, maxy2;
4334         __m128i fbmin, fbmax;
4335         __m128 viewportcenter, viewportscale;
4336         int firstvertex = command->firstvertex;
4337         int numvertices = command->numvertices;
4338         int numtriangles = command->numtriangles;
4339         const int *element3i = command->element3i;
4340         const unsigned short *element3s = command->element3s;
4341         int clipped = command->clipped;
4342         int i;
4343         int j;
4344         int k;
4345         int y;
4346         int e[3];
4347         __m128i screeny;
4348         int starty, endy, bandy;
4349         int numpoints;
4350         int clipcase;
4351         float clipdist[4];
4352         __m128 triangleedge1, triangleedge2, trianglenormal;
4353         __m128 clipfrac[3];
4354         __m128 screen[4];
4355         DPSOFTRAST_State_Triangle *triangle;
4356         DPSOFTRAST_Texture *texture;
4357         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4358         miny = thread->fb_scissor[1];
4359         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4360         miny1 = bound(miny, thread->miny1, maxy);
4361         maxy1 = bound(miny, thread->maxy1, maxy);
4362         miny2 = bound(miny, thread->miny2, maxy);
4363         maxy2 = bound(miny, thread->maxy2, maxy);
4364         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4365         {
4366                 if (!ATOMIC_DECREMENT(command->refcount))
4367                 {
4368                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4369                                 MM_FREE(command->arrays);
4370                 }
4371                 return;
4372         }
4373         minx = thread->fb_scissor[0];
4374         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4375         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4376         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4377         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4378         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4379         screen[3] = _mm_setzero_ps();
4380         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4381         for (i = 0;i < numtriangles;i++)
4382         {
4383                 const float *screencoord4f = command->arrays;
4384                 const float *arrays = screencoord4f + numvertices*4;
4385
4386                 // generate the 3 edges of this triangle
4387                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4388                 if (element3s)
4389                 {
4390                         e[0] = element3s[i*3+0] - firstvertex;
4391                         e[1] = element3s[i*3+1] - firstvertex;
4392                         e[2] = element3s[i*3+2] - firstvertex;
4393                 }
4394                 else if (element3i)
4395                 {
4396                         e[0] = element3i[i*3+0] - firstvertex;
4397                         e[1] = element3i[i*3+1] - firstvertex;
4398                         e[2] = element3i[i*3+2] - firstvertex;
4399                 }
4400                 else
4401                 {
4402                         e[0] = i*3+0;
4403                         e[1] = i*3+1;
4404                         e[2] = i*3+2;
4405                 }
4406
4407 #define SKIPBACKFACE \
4408                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4409                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4410                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4411                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4412                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4413                 switch(cullface) \
4414                 { \
4415                 case GL_BACK: \
4416                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4417                                 continue; \
4418                         break; \
4419                 case GL_FRONT: \
4420                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4421                                 continue; \
4422                         break; \
4423                 }
4424
4425 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4426                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4427                         { \
4428                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4429                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4430                         }
4431 #define CLIPPEDVERTEXCOPY(k,p1) \
4432                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4433
4434 #define GENATTRIBCOPY(attrib, p1) \
4435                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4436 #define GENATTRIBLERP(attrib, p1, p2) \
4437                 { \
4438                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4439                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4440                 }
4441 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4442                 switch(clipcase) \
4443                 { \
4444                 default: \
4445                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4446                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4447                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4448                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4449                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4450                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4451                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4452                 }
4453
4454                 if (! clipped)
4455                         goto notclipped;
4456
4457                 // calculate distance from nearplane
4458                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4459                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4460                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4461                 if (clipdist[0] >= 0.0f)
4462                 {
4463                         if (clipdist[1] >= 0.0f)
4464                         {
4465                                 if (clipdist[2] >= 0.0f)
4466                                 {
4467                                 notclipped:
4468                                         // triangle is entirely in front of nearplane
4469                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4470                                         SKIPBACKFACE;
4471                                         numpoints = 3;
4472                                         clipcase = 0;
4473                                 }
4474                                 else
4475                                 {
4476                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4477                                         SKIPBACKFACE;
4478                                         numpoints = 4;
4479                                         clipcase = 1;
4480                                 }
4481                         }
4482                         else
4483                         {
4484                                 if (clipdist[2] >= 0.0f)
4485                                 {
4486                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4487                                         SKIPBACKFACE;
4488                                         numpoints = 4;
4489                                         clipcase = 2;
4490                                 }
4491                                 else
4492                                 {
4493                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4494                                         SKIPBACKFACE;
4495                                         numpoints = 3;
4496                                         clipcase = 3;
4497                                 }
4498                         }
4499                 }
4500                 else if (clipdist[1] >= 0.0f)
4501                 {
4502                         if (clipdist[2] >= 0.0f)
4503                         {
4504                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4505                                 SKIPBACKFACE;
4506                                 numpoints = 4;
4507                                 clipcase = 4;
4508                         }
4509                         else
4510                         {
4511                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4512                                 SKIPBACKFACE;
4513                                 numpoints = 3;
4514                                 clipcase = 5;
4515                         }
4516                 }
4517                 else if (clipdist[2] >= 0.0f)
4518                 {
4519                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4520                         SKIPBACKFACE;
4521                         numpoints = 3;
4522                         clipcase = 6;
4523                 }
4524                 else continue; // triangle is entirely behind nearplane
4525
4526                 {
4527                         // calculate integer y coords for triangle points
4528                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4529                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4530                                         screenmin = _mm_min_epi16(screeni, screenir),
4531                                         screenmax = _mm_max_epi16(screeni, screenir);
4532                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4533                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4534                         screenmin = _mm_max_epi16(screenmin, fbmin);
4535                         screenmax = _mm_min_epi16(screenmax, fbmax);
4536                         // skip offscreen triangles
4537                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4538                                 continue;
4539                         starty = _mm_extract_epi16(screenmin, 1);
4540                         endy = _mm_extract_epi16(screenmax, 1)+1;
4541                         if (starty >= maxy1 && endy <= miny2)
4542                                 continue;
4543                         screeny = _mm_srai_epi32(screeni, 16);
4544                 }
4545
4546                 triangle = &thread->triangles[thread->numtriangles];
4547
4548                 // calculate attribute plans for triangle data...
4549                 // okay, this triangle is going to produce spans, we'd better project
4550                 // the interpolants now (this is what gives perspective texturing),
4551                 // this consists of simply multiplying all arrays by the W coord
4552                 // (which is basically 1/Z), which will be undone per-pixel
4553                 // (multiplying by Z again) to get the perspective-correct array
4554                 // values
4555                 {
4556                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4557                         __m128 mipedgescale, mipdensity;
4558                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4559                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4560                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4561                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4562                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4563                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4564                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4565                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4566                         attribedge1 = _mm_sub_ss(w0, w1);
4567                         attribedge2 = _mm_sub_ss(w2, w1);
4568                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4569                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4570                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4571                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4572                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4573                         _mm_store_ss(&triangle->w[0], attribxslope);
4574                         _mm_store_ss(&triangle->w[1], attribyslope);
4575                         _mm_store_ss(&triangle->w[2], attriborigin);
4576                         mipedgescale = _mm_setzero_ps();
4577                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4578                         {
4579                                 __m128 attrib0, attrib1, attrib2;
4580                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4581                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4582                                         break;
4583                                 arrays += numvertices*4;
4584                                 GENATTRIBS(attrib0, attrib1, attrib2);
4585                                 attriborigin = _mm_mul_ps(attrib1, w1);
4586                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4587                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4588                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4589                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4590                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4591                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4592                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4593                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
4594                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4595                                 {
4596                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4597                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4598                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4599                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4600                                 }
4601                         }
4602
4603                         memset(triangle->mip, 0, sizeof(triangle->mip));
4604                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4605                         {
4606                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4607                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4608                                         break;
4609                                 texture = thread->texbound[texunit];
4610                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4611                                 {
4612                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4613                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4614                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4615                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4616                                         // this will be multiplied in the texturing routine by the texture resolution
4617                                         y = _mm_cvtss_si32(mipdensity);
4618                                         if (y > 0)
4619                                         {
4620                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4621                                                 if (y > texture->mipmaps - 1)
4622                                                         y = texture->mipmaps - 1;
4623                                                 triangle->mip[texunit] = y;
4624                                         }
4625                                 }
4626                         }
4627                 }
4628         
4629                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4630                 for (; y < bandy;)
4631                 {
4632                         __m128 xcoords, xslope;
4633                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4634                         int yccmask = _mm_movemask_epi8(ycc);
4635                         int edge0p, edge0n, edge1p, edge1n;
4636                         int nexty;
4637                         if (numpoints == 4)
4638                         {
4639                                 switch(yccmask)
4640                                 {
4641                                 default:
4642                                 case 0xFFFF: /*0000*/ y = endy; continue;
4643                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4644                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4645                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4646                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4647                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4648                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4649                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4650                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4651                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4652                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4653                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4654                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4655                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4656                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4657                                 case 0x0000: /*1111*/ y++; continue;
4658                                 }
4659                         }
4660                         else
4661                         {
4662                                 switch(yccmask)
4663                                 {
4664                                 default:
4665                                 case 0xFFFF: /*000*/ y = endy; continue;
4666                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4667                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4668                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4669                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4670                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4671                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4672                                 case 0x0000: /*111*/ y++; continue;
4673                                 }
4674                         }
4675                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4676                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4677                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4678                         nexty = _mm_extract_epi16(ycc, 0);
4679                         if (nexty >= bandy) nexty = bandy-1;
4680                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4681                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4682                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4683                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4684                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4685                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4686                         {
4687                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4688                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4689                         }
4690                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4691                         {
4692                                 int startx, endx, offset;
4693                                 startx = _mm_cvtss_si32(xcoords);
4694                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4695                                 if (startx < minx) 
4696                                 {
4697                                         if (startx < 0) startx = 0;
4698                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4699                                 }
4700                                 if (endx > maxx) endx = maxx;
4701                                 if (startx >= endx) continue;
4702                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4703                                 {
4704                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4705                                         span->triangle = thread->numtriangles;
4706                                         span->x = offset;
4707                                         span->y = y;
4708                                         span->startx = max(minx - offset, 0);
4709                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4710                                         if (span->startx >= span->endx)
4711                                                 continue; 
4712                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4713                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4714                                 }
4715                         }
4716                 }
4717
4718                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4719                 {
4720                         DPSOFTRAST_Draw_ProcessSpans(thread);
4721                         thread->numtriangles = 0;
4722                 }
4723         }
4724
4725         if (!ATOMIC_DECREMENT(command->refcount))
4726         {
4727                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4728                         MM_FREE(command->arrays);
4729         }
4730
4731         if (thread->numspans > 0 || thread->numtriangles > 0)
4732         {
4733                 DPSOFTRAST_Draw_ProcessSpans(thread);
4734                 thread->numtriangles = 0;
4735         }
4736 #endif
4737 }
4738
4739 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4740 {
4741         int i;
4742         int j;
4743         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4744         int datasize = 2*numvertices*sizeof(float[4]);
4745         DPSOFTRAST_Command_Draw *command;
4746         unsigned char *data;
4747         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4748         {
4749                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4750                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4751                         break;
4752                 datasize += numvertices*sizeof(float[4]);
4753         }
4754         if (element3s)
4755                 datasize += numtriangles*sizeof(unsigned short[3]);
4756         else if (element3i)
4757                 datasize += numtriangles*sizeof(int[3]);
4758         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4759         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4760         {
4761                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4762                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4763         }
4764         else
4765         {
4766                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4767                 data = (unsigned char *)command + commandsize;
4768         }
4769         command->firstvertex = firstvertex;
4770         command->numvertices = numvertices;
4771         command->numtriangles = numtriangles;
4772         command->arrays = (float *)data;
4773         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4774         dpsoftrast.firstvertex = firstvertex;
4775         dpsoftrast.numvertices = numvertices;
4776         dpsoftrast.screencoord4f = (float *)data;
4777         data += numvertices*sizeof(float[4]);
4778         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4779         data += numvertices*sizeof(float[4]);
4780         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4781         {
4782                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4783                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4784                         break;
4785                 dpsoftrast.post_array4f[j] = (float *)data;
4786                 data += numvertices*sizeof(float[4]);
4787         }
4788         command->element3i = NULL;
4789         command->element3s = NULL;
4790         if (element3s)
4791         {
4792                 command->element3s = (unsigned short *)data;
4793                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4794         }
4795         else if (element3i)
4796         {
4797                 command->element3i = (int *)data;
4798                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4799         }
4800         return command;
4801 }
4802
4803 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4804 {
4805         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4806         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4807         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4808         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4809         if (command->starty >= command->endy)
4810         {
4811                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4812                         MM_FREE(command->arrays);
4813                 DPSOFTRAST_UndoCommand(command->commandsize);
4814                 return;
4815         }
4816         command->clipped = dpsoftrast.drawclipped;
4817         command->refcount = dpsoftrast.numthreads;
4818
4819         if (dpsoftrast.usethreads)
4820         {
4821                 int i;
4822                 DPSOFTRAST_Draw_SyncCommands();
4823                 for (i = 0; i < dpsoftrast.numthreads; i++)
4824                 {
4825                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4826                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4827                                 Thread_CondSignal(thread->drawcond);
4828                 }
4829         }
4830         else
4831         {
4832                 DPSOFTRAST_Draw_FlushThreads();
4833         }
4834 }
4835  
4836 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4837 {
4838         int commandoffset = thread->commandoffset;
4839         while (commandoffset != endoffset)
4840         {
4841                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4842                 switch (command->opcode)
4843                 {
4844 #define INTERPCOMMAND(name) \
4845                 case DPSOFTRAST_OPCODE_##name : \
4846                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4847                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4848                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4849                                 commandoffset = 0; \
4850                         break;
4851                 INTERPCOMMAND(Viewport)
4852                 INTERPCOMMAND(ClearColor)
4853                 INTERPCOMMAND(ClearDepth)
4854                 INTERPCOMMAND(ColorMask)
4855                 INTERPCOMMAND(DepthTest)
4856                 INTERPCOMMAND(ScissorTest)
4857                 INTERPCOMMAND(Scissor)
4858                 INTERPCOMMAND(BlendFunc)
4859                 INTERPCOMMAND(BlendSubtract)
4860                 INTERPCOMMAND(DepthMask)
4861                 INTERPCOMMAND(DepthFunc)
4862                 INTERPCOMMAND(DepthRange)
4863                 INTERPCOMMAND(PolygonOffset)
4864                 INTERPCOMMAND(CullFace)
4865                 INTERPCOMMAND(AlphaTest)
4866                 INTERPCOMMAND(AlphaFunc)
4867                 INTERPCOMMAND(SetTexture)
4868                 INTERPCOMMAND(SetShader)
4869                 INTERPCOMMAND(Uniform4f)
4870                 INTERPCOMMAND(UniformMatrix4f)
4871                 INTERPCOMMAND(Uniform1i)
4872
4873                 case DPSOFTRAST_OPCODE_Draw:
4874                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4875                         commandoffset += command->commandsize;
4876                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4877                                 commandoffset = 0;
4878                         thread->commandoffset = commandoffset;
4879                         break;
4880
4881                 case DPSOFTRAST_OPCODE_Reset:
4882                         commandoffset = 0;
4883                         break;
4884                 }
4885         }
4886         thread->commandoffset = commandoffset;
4887 }
4888
4889 static int DPSOFTRAST_Draw_Thread(void *data)
4890 {
4891         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4892         while(thread->index >= 0)
4893         {
4894                 if (thread->commandoffset != dpsoftrast.drawcommand)
4895                 {
4896                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4897                 }
4898                 else 
4899                 {
4900                         Thread_LockMutex(thread->drawmutex);
4901                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4902                         {
4903                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4904                                 thread->starving = true;
4905                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
4906                                 thread->starving = false;
4907                         }
4908                         Thread_UnlockMutex(thread->drawmutex);
4909                 }
4910         }   
4911         return 0;
4912 }
4913
4914 static void DPSOFTRAST_Draw_FlushThreads(void)
4915 {
4916         DPSOFTRAST_State_Thread *thread;
4917         int i;
4918         DPSOFTRAST_Draw_SyncCommands();
4919         if (dpsoftrast.usethreads) 
4920         {
4921                 for (i = 0; i < dpsoftrast.numthreads; i++)
4922                 {
4923                         thread = &dpsoftrast.threads[i];
4924                         if (thread->commandoffset != dpsoftrast.drawcommand)
4925                         {
4926                                 Thread_LockMutex(thread->drawmutex);
4927                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4928                                         Thread_CondSignal(thread->drawcond);
4929                                 Thread_UnlockMutex(thread->drawmutex);
4930                         }
4931                 }
4932                 for (i = 0; i < dpsoftrast.numthreads; i++)
4933                 {
4934                         thread = &dpsoftrast.threads[i];
4935                         if (thread->commandoffset != dpsoftrast.drawcommand)
4936                         {
4937                                 Thread_LockMutex(thread->drawmutex);
4938                                 if (thread->commandoffset != dpsoftrast.drawcommand)
4939                                 {
4940                                         thread->waiting = true;
4941                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
4942                                         thread->waiting = false;
4943                                 }
4944                                 Thread_UnlockMutex(thread->drawmutex);
4945                         }
4946                 }
4947         }
4948         else
4949         {
4950                 for (i = 0; i < dpsoftrast.numthreads; i++)
4951                 {
4952                         thread = &dpsoftrast.threads[i];
4953                         if (thread->commandoffset != dpsoftrast.drawcommand)
4954                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4955                 }
4956         }
4957         dpsoftrast.commandpool.usedcommands = 0;
4958 }
4959
4960 void DPSOFTRAST_Flush(void)
4961 {
4962         DPSOFTRAST_Draw_FlushThreads();
4963 }
4964
4965 void DPSOFTRAST_Finish(void)
4966 {
4967         DPSOFTRAST_Flush();
4968 }
4969
4970 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4971 {
4972         int i;
4973         union
4974         {
4975                 int i;
4976                 unsigned char b[4];
4977         }
4978         u;
4979         u.i = 1;
4980         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4981         dpsoftrast.bigendian = u.b[3];
4982         dpsoftrast.fb_width = width;
4983         dpsoftrast.fb_height = height;
4984         dpsoftrast.fb_depthpixels = depthpixels;
4985         dpsoftrast.fb_colorpixels[0] = colorpixels;
4986         dpsoftrast.fb_colorpixels[1] = NULL;
4987         dpsoftrast.fb_colorpixels[1] = NULL;
4988         dpsoftrast.fb_colorpixels[1] = NULL;
4989         dpsoftrast.viewport[0] = 0;
4990         dpsoftrast.viewport[1] = 0;
4991         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4992         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4993         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4994         dpsoftrast.texture_firstfree = 1;
4995         dpsoftrast.texture_end = 1;
4996         dpsoftrast.texture_max = 0;
4997         dpsoftrast.color[0] = 1;
4998         dpsoftrast.color[1] = 1;
4999         dpsoftrast.color[2] = 1;
5000         dpsoftrast.color[3] = 1;
5001         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5002         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5003         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5004         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5005         for (i = 0; i < dpsoftrast.numthreads; i++)
5006         {
5007                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5008                 thread->index = i;
5009                 thread->cullface = GL_BACK;
5010                 thread->colormask[1] = 1;
5011                 thread->colormask[2] = 1;
5012                 thread->colormask[3] = 1;
5013                 thread->blendfunc[0] = GL_ONE;
5014                 thread->blendfunc[1] = GL_ZERO;
5015                 thread->depthmask = true;
5016                 thread->depthtest = true;
5017                 thread->depthfunc = GL_LEQUAL;
5018                 thread->scissortest = false;
5019                 thread->alphatest = false;
5020                 thread->alphafunc = GL_GREATER;
5021                 thread->alphavalue = 0.5f;
5022                 thread->viewport[0] = 0;
5023                 thread->viewport[1] = 0;
5024                 thread->viewport[2] = dpsoftrast.fb_width;
5025                 thread->viewport[3] = dpsoftrast.fb_height;
5026                 thread->scissor[0] = 0;
5027                 thread->scissor[1] = 0;
5028                 thread->scissor[2] = dpsoftrast.fb_width;
5029                 thread->scissor[3] = dpsoftrast.fb_height;
5030                 thread->depthrange[0] = 0;
5031                 thread->depthrange[1] = 1;
5032                 thread->polygonoffset[0] = 0;
5033                 thread->polygonoffset[1] = 0;
5034         
5035                 if (dpsoftrast.interlace)
5036                 {
5037                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5038                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5039                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5040                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5041                 }
5042                 else
5043                 {
5044                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5045                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5046                 }
5047
5048                 thread->numspans = 0;
5049                 thread->numtriangles = 0;
5050                 thread->commandoffset = 0;
5051                 thread->waiting = false;
5052                 thread->starving = false;
5053            
5054                 thread->validate = -1;
5055                 DPSOFTRAST_Validate(thread, -1);
5056  
5057                 if (dpsoftrast.usethreads)
5058                 {
5059                         thread->waitcond = Thread_CreateCond();
5060                         thread->drawcond = Thread_CreateCond();
5061                         thread->drawmutex = Thread_CreateMutex();
5062                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5063                 }
5064         }
5065         return 0;
5066 }
5067
5068 void DPSOFTRAST_Shutdown(void)
5069 {
5070         int i;
5071         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5072         {
5073                 DPSOFTRAST_State_Thread *thread;
5074                 for (i = 0; i < dpsoftrast.numthreads; i++)
5075                 {
5076                         thread = &dpsoftrast.threads[i];
5077                         Thread_LockMutex(thread->drawmutex);
5078                         thread->index = -1;
5079                         Thread_CondSignal(thread->drawcond);
5080                         Thread_UnlockMutex(thread->drawmutex);
5081                         Thread_WaitThread(thread->thread, 0);
5082                         Thread_DestroyCond(thread->waitcond);
5083                         Thread_DestroyCond(thread->drawcond);
5084                         Thread_DestroyMutex(thread->drawmutex);
5085                 }
5086         }
5087         for (i = 0;i < dpsoftrast.texture_end;i++)
5088                 if (dpsoftrast.texture[i].bytes)
5089                         MM_FREE(dpsoftrast.texture[i].bytes);
5090         if (dpsoftrast.texture)
5091                 free(dpsoftrast.texture);
5092         if (dpsoftrast.threads)
5093                 MM_FREE(dpsoftrast.threads);
5094         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5095 }
5096