]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
b43fa29b09495fa56e9edb14721490b9bb1fd5ea
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
12 #ifndef __cplusplus
13 typedef qboolean bool;
14 #endif
15
16 #define ALIGN_SIZE 16
17 #define ATOMIC_SIZE 32
18
19 #ifdef SSE2_PRESENT
20         #if defined(__GNUC__)
21                 #define ALIGN(var) var __attribute__((__aligned__(16)))
22                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
23                 #ifdef USE_THREADS
24                         #define MEMORY_BARRIER (_mm_sfence())
25                         //(__sync_synchronize())
26                         #define ATOMIC_COUNTER volatile int
27                         #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28                         #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29                         #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
30                 #endif
31         #elif defined(_MSC_VER)
32                 #define ALIGN(var) __declspec(align(16)) var
33                 #define ATOMIC(var) __declspec(align(32)) var
34                 #ifdef USE_THREADS
35                         #define MEMORY_BARRIER (_mm_sfence())
36                         //(MemoryBarrier())
37                         #define ATOMIC_COUNTER volatile LONG
38                         #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39                         #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
40                         #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
41                 #endif
42         #else
43                 #undef USE_THREADS
44                 #undef SSE2_PRESENT
45         #endif
46 #endif
47
48 #ifndef SSE2_PRESENT
49         #define ALIGN(var) var
50         #define ATOMIC(var) var
51 #endif
52
53 #ifdef USE_THREADS
54 #include <SDL.h>
55 #include <SDL_thread.h>
56 #else
57         #define MEMORY_BARRIER ((void)0)
58         #define ATOMIC_COUNTER int
59         #define ATOMIC_INCREMENT(counter) (++(counter))
60         #define ATOMIC_DECREMENT(counter) (--(counter))
61         #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62         typedef void SDL_Thread;
63         typedef void SDL_cond;
64         typedef void SDL_mutex;
65 #endif
66
// allocation helpers: SSE2 builds use the aligned _mm_malloc family (aligned
// to ATOMIC_SIZE), other builds use the plain C allocator
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// aligned equivalent of calloc: returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or the byte count does not fit in a size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around, as calloc does
	if (size != 0 && nmemb > ((size_t)-1) / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
85
// indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
101
102 typedef struct DPSOFTRAST_Texture_s
103 {
104         int flags;
105         int width;
106         int height;
107         int depth;
108         int sides;
109         DPSOFTRAST_TEXTURE_FILTER filter;
110         int mipmaps;
111         int size;
112         ATOMIC_COUNTER binds;
113         unsigned char *bytes;
114         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
115 }
116 DPSOFTRAST_Texture;
117
// commands share the alignment used by ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header shared by every command; the DEFCOMMAND structs start with the
// same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// declares the opcode constant DPSOFTRAST_OPCODE_<name> and the struct
// DPSOFTRAST_Command_<name>, which is the common header followed by fields
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );

// size in bytes of the command pool buffer, and a per-command size limit
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
141
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
143 {
144         int freecommand;
145         int usedcommands;
146         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
147 }
148 DPSOFTRAST_State_Command_Pool);
149
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
151 {
152         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
153         float w[3];
154         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
155 }
156 DPSOFTRAST_State_Triangle);
157
// evaluates attribute array arrayindex of triangle at the start of span:
//   slope = attribs[arrayindex][0]                 (change per pixel along x)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: data and slope are __m128 lvalues
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// scalar version of the above: data and slope are float[4] arrays
// (span is parenthesized before -> so that expression arguments work)
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
174                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels belonging to a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer x coord
	int x;
	// framebuffer y coord
	int y;
	// pixel count
	int length;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
}
DPSOFTRAST_State_Span);
188
// capacities of the per-thread span and triangle arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of DPSOFTRAST_State_Thread::validate, one per group of derived values
// that has to be recalculated before use (see DPSOFTRAST_Validate)
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// all of the above
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
196
// blend modes selected by DPSOFTRAST_RecalcBlendFunc from the blendfunc
// factors; DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_TOTAL = 9
}
DPSOFTRAST_BLENDMODE;
211
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
213 {
214         SDL_Thread *thread;
215         int index;
216         
217         int cullface;
218         int colormask[4];
219         int blendfunc[2];
220         int blendsubtract;
221         int depthmask;
222         int depthtest;
223         int depthfunc;
224         int scissortest;
225         int alphatest;
226         int alphafunc;
227         float alphavalue;
228         int viewport[4];
229         int scissor[4];
230         float depthrange[2];
231         float polygonoffset[2];
232
233         int shader_mode;
234         int shader_permutation;
235
236         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
237         
238         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
240
241         // DPSOFTRAST_VALIDATE_ flags
242         int validate;
243
244         // derived values (DPSOFTRAST_VALIDATE_FB)
245         int fb_colormask;
246         int fb_clearscissor[4];
247         ALIGN(float fb_viewportcenter[4]);
248         ALIGN(float fb_viewportscale[4]);
249
250         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
251         int fb_depthfunc;
252
253         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
254         int fb_blendmode;
255
256         ATOMIC(volatile int commandoffset);
257
258         volatile bool waiting;
259         volatile bool starving;
260         SDL_cond *waitcond;
261         SDL_cond *drawcond;
262         SDL_mutex *drawmutex;
263
264         int numspans;
265         int numtriangles;
266         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
267         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
268 }
269 DPSOFTRAST_State_Thread);
270
271 typedef ATOMIC(struct DPSOFTRAST_State_s
272 {
273         int fb_width;
274         int fb_height;
275         unsigned int *fb_depthpixels;
276         unsigned int *fb_colorpixels[4];
277
278         int viewport[4];
279         ALIGN(float fb_viewportcenter[4]);
280         ALIGN(float fb_viewportscale[4]);
281
282         float color[4];
283         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
284         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
285
286         const float *pointer_vertex3f;
287         const float *pointer_color4f;
288         const unsigned char *pointer_color4ub;
289         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
290         int stride_vertex;
291         int stride_color;
292         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
294         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
295
296         int firstvertex;
297         int numvertices;
298         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
299         float *screencoord4f;
300         int drawstarty;
301         int drawendy;
302         int drawclipped;
303         
304         int shader_mode;
305         int shader_permutation;
306
307         int texture_max;
308         int texture_end;
309         int texture_firstfree;
310         DPSOFTRAST_Texture *texture;
311
312         int bigendian;
313
314         // error reporting
315         const char *errorstring;
316
317         int numthreads;
318         DPSOFTRAST_State_Thread *threads;
319
320         ATOMIC(volatile int drawcommand);
321
322         DPSOFTRAST_State_Command_Pool commandpool;
323 }
324 DPSOFTRAST_State);
325
326 DPSOFTRAST_State dpsoftrast;
327
// scale (2^30) and offset used when converting depth to fixed point
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (expected range 0..1) into one BGRA8 pixel laid
// out as 0xAARRGGBB, rounding each channel to the nearest of 0..255;
// the shifts are done on unsigned values because alpha >= 0.5 sets bit 31,
// which is not representable when shifting a signed int
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	((unsigned int)(int)((b) * 255.0f + 0.5f)) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// converts a float depth to fixed point; the value is inverted, so depth 0
// maps to DPSOFTRAST_DEPTHSCALE and depth 1 maps to 0
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// NOTE(review): presumably the longest span handled in one piece - confirm at the use sites
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
333
334 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
335 {
336         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
337         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
338         fb_viewportcenter[3] = 0.5f;
339         fb_viewportcenter[0] = 0.0f;
340         fb_viewportscale[1] = 0.5f * viewport[2];
341         fb_viewportscale[2] = -0.5f * viewport[3];
342         fb_viewportscale[3] = 0.5f;
343         fb_viewportscale[0] = 1.0f;
344 }
345
346 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
347 {
348         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
349         // and viewport projection values
350         int x1, x2;
351         int y1, y2;
352         x1 = thread->scissor[0];
353         x2 = thread->scissor[0] + thread->scissor[2];
354         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
355         y2 = dpsoftrast.fb_height - thread->scissor[1];
356         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
357         if (x1 < 0) x1 = 0;
358         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
359         if (y1 < 0) y1 = 0;
360         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
361         thread->fb_clearscissor[0] = x1;
362         thread->fb_clearscissor[1] = y1;
363         thread->fb_clearscissor[2] = x2 - x1;
364         thread->fb_clearscissor[3] = y2 - y1;
365
366         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
367 }
368
369 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
370 {
371         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
372 }
373
374 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
375 {
376         if (thread->blendsubtract)
377         {
378                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
379                 {
380                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
381                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
382                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
383                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
384                 }
385         }
386         else
387         {       
388                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
389                 {
390                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
391                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
393                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
394                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
395                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
396                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
398                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
399                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
400                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
401                 }
402         }
403 }
404
405 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
406
407 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
408 {
409         mask &= thread->validate;
410         if (!mask)
411                 return;
412         if (mask & DPSOFTRAST_VALIDATE_FB)
413         {
414                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
415                 DPSOFTRAST_RecalcFB(thread);
416         }
417         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
418         {
419                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
420                 DPSOFTRAST_RecalcDepthFunc(thread);
421         }
422         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
423         {
424                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
425                 DPSOFTRAST_RecalcBlendFunc(thread);
426         }
427 }
428
429 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
430 {
431         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
432                 return &dpsoftrast.texture[index];
433         return NULL;
434 }
435
436 static void DPSOFTRAST_Texture_Grow(void)
437 {
438         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
439         DPSOFTRAST_State_Thread *thread;
440         int i;
441         int j;
442         DPSOFTRAST_Flush();
443         // expand texture array as needed
444         if (dpsoftrast.texture_max < 1024)
445                 dpsoftrast.texture_max = 1024;
446         else
447                 dpsoftrast.texture_max *= 2;
448         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
449         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
450                 if(dpsoftrast.texbound[i])
451                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
452         for (j = 0; j < dpsoftrast.numthreads; j++)
453         {
454                 thread = &dpsoftrast.threads[j];
455                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
456                         if(thread->texbound[i])
457                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
458         }
459 }
460
461 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
462 {
463         int w;
464         int h;
465         int d;
466         int size;
467         int s;
468         int texnum;
469         int mipmaps;
470         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
471         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
472         DPSOFTRAST_Texture *texture;
473         if (width*height*depth < 1)
474         {
475                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
476                 return 0;
477         }
478         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
479         {
480                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
481                 return 0;
482         }
483         switch(texformat)
484         {
485         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
486         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
488                 break;
489         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
490                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
491                 {
492                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
493                         return 0;
494                 }
495                 if (depth != 1)
496                 {
497                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
498                         return 0;
499                 }
500                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
501                 {
502                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
503                         return 0;
504                 }
505                 break;
506         }
507         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
510                 return 0;
511         }
512         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
513         {
514                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
515                 return 0;
516         }
517         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
518         {
519                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
520                 return 0;
521         }
522         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
523         {
524                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
525                 return 0;
526         }
527         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
528         {
529                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
530                 return 0;
531         }
532         // find first empty slot in texture array
533         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
534                 if (!dpsoftrast.texture[texnum].bytes)
535                         break;
536         dpsoftrast.texture_firstfree = texnum + 1;
537         if (dpsoftrast.texture_max <= texnum)
538                 DPSOFTRAST_Texture_Grow();
539         if (dpsoftrast.texture_end <= texnum)
540                 dpsoftrast.texture_end = texnum + 1;
541         texture = &dpsoftrast.texture[texnum];
542         memset(texture, 0, sizeof(*texture));
543         texture->flags = flags;
544         texture->width = width;
545         texture->height = height;
546         texture->depth = depth;
547         texture->sides = sides;
548         texture->binds = 0;
549         w = width;
550         h = height;
551         d = depth;
552         size = 0;
553         mipmaps = 0;
554         w = width;
555         h = height;
556         d = depth;
557         for (;;)
558         {
559                 s = w * h * d * sides * 4;
560                 texture->mipmap[mipmaps][0] = size;
561                 texture->mipmap[mipmaps][1] = s;
562                 texture->mipmap[mipmaps][2] = w;
563                 texture->mipmap[mipmaps][3] = h;
564                 texture->mipmap[mipmaps][4] = d;
565                 size += s;
566                 mipmaps++;
567                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
568                         break;
569                 if (w > 1) w >>= 1;
570                 if (h > 1) h >>= 1;
571                 if (d > 1) d >>= 1;
572         }
573         texture->mipmaps = mipmaps;
574         texture->size = size;
575
576         // allocate the pixels now
577         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
578
579         return texnum;
580 }
581 void DPSOFTRAST_Texture_Free(int index)
582 {
583         DPSOFTRAST_Texture *texture;
584         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
585         if (texture->binds)
586                 DPSOFTRAST_Flush();
587         if (texture->bytes)
588                 MM_FREE(texture->bytes);
589         texture->bytes = NULL;
590         memset(texture, 0, sizeof(*texture));
591         // adjust the free range and used range
592         if (dpsoftrast.texture_firstfree > index)
593                 dpsoftrast.texture_firstfree = index;
594         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
595                 dpsoftrast.texture_end--;
596 }
597 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
598 {
599         int i, x, y, z, w, layer0, layer1, row0, row1;
600         unsigned char *o, *i0, *i1, *i2, *i3;
601         DPSOFTRAST_Texture *texture;
602         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
603         if (texture->mipmaps <= 1)
604                 return;
605         for (i = 1;i < texture->mipmaps;i++)
606         {
607                 for (z = 0;z < texture->mipmap[i][4];z++)
608                 {
609                         layer0 = z*2;
610                         layer1 = z*2+1;
611                         if (layer1 >= texture->mipmap[i-1][4])
612                                 layer1 = texture->mipmap[i-1][4]-1;
613                         for (y = 0;y < texture->mipmap[i][3];y++)
614                         {
615                                 row0 = y*2;
616                                 row1 = y*2+1;
617                                 if (row1 >= texture->mipmap[i-1][3])
618                                         row1 = texture->mipmap[i-1][3]-1;
619                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
620                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
621                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
622                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
623                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
624                                 w = texture->mipmap[i][2];
625                                 if (layer1 > layer0)
626                                 {
627                                         if (texture->mipmap[i-1][2] > 1)
628                                         {
629                                                 // average 3D texture
630                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
631                                                 {
632                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
633                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
634                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
635                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
636                                                 }
637                                         }
638                                         else
639                                         {
640                                                 // average 3D mipmap with parent width == 1
641                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
642                                                 {
643                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
644                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
645                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
646                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
647                                                 }
648                                         }
649                                 }
650                                 else
651                                 {
652                                         if (texture->mipmap[i-1][2] > 1)
653                                         {
654                                                 // average 2D texture (common case)
655                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
656                                                 {
657                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
658                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
659                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
660                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
661                                                 }
662                                         }
663                                         else
664                                         {
665                                                 // 2D texture with parent width == 1
666                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
667                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
668                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
669                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
670                                         }
671                                 }
672                         }
673                 }
674         }
675 }
676 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
677 {
678         DPSOFTRAST_Texture *texture;
679         unsigned char *dst;
680         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
681         if (texture->binds)
682                 DPSOFTRAST_Flush();
683         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
684         while (blockheight > 0)
685         {
686                 memcpy(dst, pixels, blockwidth * 4);
687                 pixels += blockwidth * 4;
688                 dst += texture->mipmap[0][2] * 4;
689                 blockheight--;
690         }
691         DPSOFTRAST_Texture_CalculateMipmaps(index);
692 }
693 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
694 {
695         DPSOFTRAST_Texture *texture;
696         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
697         if (texture->binds)
698                 DPSOFTRAST_Flush();
699         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
700         DPSOFTRAST_Texture_CalculateMipmaps(index);
701 }
702 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
703 {
704         DPSOFTRAST_Texture *texture;
705         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
706         return texture->mipmap[mip][2];
707 }
708 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
709 {
710         DPSOFTRAST_Texture *texture;
711         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
712         return texture->mipmap[mip][3];
713 }
714 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
715 {
716         DPSOFTRAST_Texture *texture;
717         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
718         return texture->mipmap[mip][4];
719 }
720 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
721 {
722         DPSOFTRAST_Texture *texture;
723         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
724         if (texture->binds)
725                 DPSOFTRAST_Flush();
726         return texture->bytes + texture->mipmap[mip][0];
727 }
728 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
729 {
730         DPSOFTRAST_Texture *texture;
731         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
733         {
734                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
735                 return;
736         }
737         if (texture->binds)
738                 DPSOFTRAST_Flush();
739         texture->filter = filter;
740 }
741
742 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
743 {
744         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
745                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
746                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
747                 DPSOFTRAST_Flush();
748         dpsoftrast.fb_width = width;
749         dpsoftrast.fb_height = height;
750         dpsoftrast.fb_depthpixels = depthpixels;
751         dpsoftrast.fb_colorpixels[0] = colorpixels0;
752         dpsoftrast.fb_colorpixels[1] = colorpixels1;
753         dpsoftrast.fb_colorpixels[2] = colorpixels2;
754         dpsoftrast.fb_colorpixels[3] = colorpixels3;
755 }
756
757 static void DPSOFTRAST_Draw_FlushThreads(void);
758
759 static void DPSOFTRAST_Draw_SyncCommands(void)
760 {
761         MEMORY_BARRIER;
762         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
763 }
764
// Makes sure no more than DPSOFTRAST_DRAW_MAXCOMMANDPOOL - space units of the
// command pool are counted as in use.
// Threaded build: measures, per thread, the distance from its commandoffset to
// the pool's freecommand position, and waits on the thread with the largest
// distance until the largest distance fits. The result is stored back into
// commandpool.usedcommands.
// Non-threaded build: simply calls DPSOFTRAST_Draw_FlushThreads.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
#ifdef USE_THREADS
	const int writepos = dpsoftrast.commandpool.freecommand;
	const int limit = DPSOFTRAST_DRAW_MAXCOMMANDPOOL - space;
	int pending = dpsoftrast.commandpool.usedcommands;
	if (pending <= limit)
		return;
	DPSOFTRAST_Draw_SyncCommands();
	for (;;)
	{
		DPSOFTRAST_State_Thread *slowest = NULL;
		int t;
		// find the thread that is furthest behind the write position
		pending = 0;
		for (t = 0; t < dpsoftrast.numthreads; t++)
		{
			DPSOFTRAST_State_Thread *candidate = &dpsoftrast.threads[t];
			int backlog = writepos - candidate->commandoffset;
			if (backlog < 0)
				backlog += DPSOFTRAST_DRAW_MAXCOMMANDPOOL; // distance wraps around the pool
			if (backlog > pending)
			{
				slowest = candidate;
				pending = backlog;
			}
		}
		if (pending <= limit || !slowest)
			break;
		SDL_LockMutex(slowest->drawmutex);
		// only sleep if the thread has not yet reached the published draw position
		if (slowest->commandoffset != dpsoftrast.drawcommand)
		{
			slowest->waiting = true;
			if (slowest->starving) SDL_CondSignal(slowest->drawcond);
			SDL_CondWait(slowest->waitcond, slowest->drawmutex);
			slowest->waiting = false;
		}
		SDL_UnlockMutex(slowest->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = pending;
#else
	DPSOFTRAST_Draw_FlushThreads();
#endif
}
810
// Rounds size up to the next multiple of COMMAND_SIZE. The mask arithmetic
// only works when COMMAND_SIZE is a power of two. size is evaluated twice.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a DPSOFTRAST_Command_<name> through DPSOFTRAST_AllocateCommand,
// passing DPSOFTRAST_OPCODE_<name> and the aligned size of the command struct.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
815
816 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
817 {
818         DPSOFTRAST_Command *command;
819         int freecommand = dpsoftrast.commandpool.freecommand;
820         int usedcommands = dpsoftrast.commandpool.usedcommands;
821         int extra = sizeof(DPSOFTRAST_Command);
822         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
823                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
824         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
825         {
826                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
888         unsigned int *p;
889         unsigned int c;
890         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
891         x1 = thread->fb_clearscissor[0];
892         y1 = thread->fb_clearscissor[1];
893         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
894         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
895         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
896         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
897         if(y1 < t1) y1 = t1;
898         if(y2 > t2) y2 = t2;
899         w = x2 - x1;
900         h = y2 - y1;
901         if (w < 1 || h < 1)
902                 return;
903         // FIXME: honor fb_colormask?
904         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
905         for (i = 0;i < 4;i++)
906         {
907                 if (!dpsoftrast.fb_colorpixels[i])
908                         continue;
909                 for (y = y1;y < y2;y++)
910                 {
911                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
912                         for (x = x1;x < x2;x++)
913                                 p[x] = c;
914                 }
915         }
916 }
917 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
918 {
919         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
920         command->r = r;
921         command->g = g;
922         command->b = b;
923         command->a = a;
924 }
925
926 DEFCOMMAND(3, ClearDepth, float depth;)
927 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
928 {
929         int x1, y1, x2, y2, w, h, x, y, t1, t2;
930         unsigned int *p;
931         unsigned int c;
932         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
933         x1 = thread->fb_clearscissor[0];
934         y1 = thread->fb_clearscissor[1];
935         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
936         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
937         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
938         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
939         if(y1 < t1) y1 = t1;
940         if(y2 > t2) y2 = t2;
941         w = x2 - x1;
942         h = y2 - y1;
943         if (w < 1 || h < 1)
944                 return;
945         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
946         for (y = y1;y < y2;y++)
947         {
948                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
949                 for (x = x1;x < x2;x++)
950                         p[x] = c;
951         }
952 }
953 void DPSOFTRAST_ClearDepth(float d)
954 {
955         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
956         command->depth = d;
957 }
958
959 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
960 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
961 {
962         thread->colormask[0] = command->r != 0;
963         thread->colormask[1] = command->g != 0;
964         thread->colormask[2] = command->b != 0;
965         thread->colormask[3] = command->a != 0;
966         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
967 }
968 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
969 {
970         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
971         command->r = r;
972         command->g = g;
973         command->b = b;
974         command->a = a;
975 }
976
977 DEFCOMMAND(5, DepthTest, int enable;)
978 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
979 {
980         thread->depthtest = command->enable;
981         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
982 }
983 void DPSOFTRAST_DepthTest(int enable)
984 {
985         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
986         command->enable = enable;
987 }
988
989 DEFCOMMAND(6, ScissorTest, int enable;)
990 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
991 {
992         thread->scissortest = command->enable;
993         thread->validate |= DPSOFTRAST_VALIDATE_FB;
994 }
995 void DPSOFTRAST_ScissorTest(int enable)
996 {
997         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
998         command->enable = enable;
999 }
1000
1001 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1002 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1003 {
1004         thread->scissor[0] = command->x;
1005         thread->scissor[1] = command->y;
1006         thread->scissor[2] = command->width;
1007         thread->scissor[3] = command->height;
1008         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1009 }
1010 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1011 {
1012         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1013         command->x = x;
1014         command->y = y;
1015         command->width = width;
1016         command->height = height;
1017 }
1018
1019 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1020 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1021 {
1022         thread->blendfunc[0] = command->sfactor;
1023         thread->blendfunc[1] = command->dfactor;
1024         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1025 }
1026 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1027 {
1028         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1029         command->sfactor = sfactor;
1030         command->dfactor = dfactor;
1031 }
1032
1033 DEFCOMMAND(9, BlendSubtract, int enable;)
1034 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1035 {
1036         thread->blendsubtract = command->enable;
1037         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1038 }
1039 void DPSOFTRAST_BlendSubtract(int enable)
1040 {
1041         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1042         command->enable = enable;
1043 }
1044
1045 DEFCOMMAND(10, DepthMask, int enable;)
1046 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1047 {
1048         thread->depthmask = command->enable;
1049 }
1050 void DPSOFTRAST_DepthMask(int enable)
1051 {
1052         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1053         command->enable = enable;
1054 }
1055
1056 DEFCOMMAND(11, DepthFunc, int func;)
1057 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1058 {
1059         thread->depthfunc = command->func;
1060 }
1061 void DPSOFTRAST_DepthFunc(int func)
1062 {
1063         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1064         command->func = func;
1065 }
1066
1067 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1068 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1069 {
1070         thread->depthrange[0] = command->nearval;
1071         thread->depthrange[1] = command->farval;
1072 }
1073 void DPSOFTRAST_DepthRange(float nearval, float farval)
1074 {
1075         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1076         command->nearval = nearval;
1077         command->farval = farval;
1078 }
1079
1080 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1081 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1082 {
1083         thread->polygonoffset[0] = command->alongnormal;
1084         thread->polygonoffset[1] = command->intoview;
1085 }
1086 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1087 {
1088         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1089         command->alongnormal = alongnormal;
1090         command->intoview = intoview;
1091 }
1092
1093 DEFCOMMAND(14, CullFace, int mode;)
1094 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1095 {
1096         thread->cullface = command->mode;
1097 }
1098 void DPSOFTRAST_CullFace(int mode)
1099 {
1100         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1101         command->mode = mode;
1102 }
1103
1104 DEFCOMMAND(15, AlphaTest, int enable;)
1105 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1106 {
1107         thread->alphatest = command->enable;
1108 }
1109 void DPSOFTRAST_AlphaTest(int enable)
1110 {
1111         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1112         command->enable = enable;
1113 }
1114
1115 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1116 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1117 {
1118         thread->alphafunc = command->func;
1119         thread->alphavalue = command->ref;
1120 }
1121 void DPSOFTRAST_AlphaFunc(int func, float ref)
1122 {
1123         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1124         command->func = func;
1125         command->ref = ref;
1126 }
1127
1128 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1129 {
1130         dpsoftrast.color[0] = r;
1131         dpsoftrast.color[1] = g;
1132         dpsoftrast.color[2] = b;
1133         dpsoftrast.color[3] = a;
1134 }
1135
1136 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1137 {
1138         int outstride = blockwidth * 4;
1139         int instride = dpsoftrast.fb_width * 4;
1140         int bx1 = blockx;
1141         int by1 = blocky;
1142         int bx2 = blockx + blockwidth;
1143         int by2 = blocky + blockheight;
1144         int bw;
1145         int bh;
1146         int x;
1147         int y;
1148         unsigned char *inpixels;
1149         unsigned char *b;
1150         unsigned char *o;
1151         DPSOFTRAST_Flush();
1152         if (bx1 < 0) bx1 = 0;
1153         if (by1 < 0) by1 = 0;
1154         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1155         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1156         bw = bx2 - bx1;
1157         bh = by2 - by1;
1158         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1159         if (dpsoftrast.bigendian)
1160         {
1161                 for (y = by1;y < by2;y++)
1162                 {
1163                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1164                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1165                         for (x = bx1;x < bx2;x++)
1166                         {
1167                                 o[0] = b[3];
1168                                 o[1] = b[2];
1169                                 o[2] = b[1];
1170                                 o[3] = b[0];
1171                                 o += 4;
1172                                 b += 4;
1173                         }
1174                 }
1175         }
1176         else
1177         {
1178                 for (y = by1;y < by2;y++)
1179                 {
1180                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1181                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1182                         memcpy(o, b, bw*4);
1183                 }
1184         }
1185
1186 }
1187 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1188 {
1189         int tx1 = tx;
1190         int ty1 = ty;
1191         int tx2 = tx + width;
1192         int ty2 = ty + height;
1193         int sx1 = sx;
1194         int sy1 = sy;
1195         int sx2 = sx + width;
1196         int sy2 = sy + height;
1197         int swidth;
1198         int sheight;
1199         int twidth;
1200         int theight;
1201         int sw;
1202         int sh;
1203         int tw;
1204         int th;
1205         int y;
1206         unsigned int *spixels;
1207         unsigned int *tpixels;
1208         DPSOFTRAST_Texture *texture;
1209         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1210         if (mip < 0 || mip >= texture->mipmaps) return;
1211         if (texture->binds)
1212                 DPSOFTRAST_Flush();
1213         spixels = dpsoftrast.fb_colorpixels[0];
1214         swidth = dpsoftrast.fb_width;
1215         sheight = dpsoftrast.fb_height;
1216         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1217         twidth = texture->mipmap[mip][2];
1218         theight = texture->mipmap[mip][3];
1219         if (tx1 < 0) tx1 = 0;
1220         if (ty1 < 0) ty1 = 0;
1221         if (tx2 > twidth) tx2 = twidth;
1222         if (ty2 > theight) ty2 = theight;
1223         if (sx1 < 0) sx1 = 0;
1224         if (sy1 < 0) sy1 = 0;
1225         if (sx2 > swidth) sx2 = swidth;
1226         if (sy2 > sheight) sy2 = sheight;
1227         tw = tx2 - tx1;
1228         th = ty2 - ty1;
1229         sw = sx2 - sx1;
1230         sh = sy2 - sy1;
1231         if (tw > sw) tw = sw;
1232         if (th > sh) th = sh;
1233         if (tw < 1 || th < 1)
1234                 return;
1235         for (y = 0;y < th;y++)
1236                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1237         if (texture->mipmaps > 1)
1238                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1239 }
1240
1241 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1242 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1243 {
1244         if (thread->texbound[command->unitnum])
1245                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1246         thread->texbound[command->unitnum] = command->texture;
1247 }
1248 void DPSOFTRAST_SetTexture(int unitnum, int index)
1249 {
1250         DPSOFTRAST_Command_SetTexture *command;
1251         DPSOFTRAST_Texture *texture;
1252         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1253         {
1254                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1255                 return;
1256         }
1257         texture = DPSOFTRAST_Texture_GetByIndex(index);
1258         if (index && !texture)
1259         {
1260                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1261                 return;
1262         }
1263
1264         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1265         command->unitnum = unitnum;
1266         command->texture = texture;
1267
1268         dpsoftrast.texbound[unitnum] = texture;
1269         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1270 }
1271
1272 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1273 {
1274         dpsoftrast.pointer_vertex3f = vertex3f;
1275         dpsoftrast.stride_vertex = stride;
1276 }
1277 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1278 {
1279         dpsoftrast.pointer_color4f = color4f;
1280         dpsoftrast.pointer_color4ub = NULL;
1281         dpsoftrast.stride_color = stride;
1282 }
1283 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1284 {
1285         dpsoftrast.pointer_color4f = NULL;
1286         dpsoftrast.pointer_color4ub = color4ub;
1287         dpsoftrast.stride_color = stride;
1288 }
1289 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1290 {
1291         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1292         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1293         dpsoftrast.stride_texcoord[unitnum] = stride;
1294 }
1295
1296 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1297 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1298 {
1299         thread->shader_mode = command->mode;
1300         thread->shader_permutation = command->permutation;
1301 }
1302 void DPSOFTRAST_SetShader(int mode, int permutation)
1303 {
1304         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1305         command->mode = mode;
1306         command->permutation = permutation;
1307
1308         dpsoftrast.shader_mode = mode;
1309         dpsoftrast.shader_permutation = permutation;
1310 }
1311
1312 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1313 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1314 {
1315         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1316 }
1317 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1318 {
1319         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1320         command->index = index;
1321         command->val[0] = v0;
1322         command->val[1] = v1;
1323         command->val[2] = v2;
1324         command->val[3] = v3;
1325
1326         dpsoftrast.uniform4f[index*4+0] = v0;
1327         dpsoftrast.uniform4f[index*4+1] = v1;
1328         dpsoftrast.uniform4f[index*4+2] = v2;
1329         dpsoftrast.uniform4f[index*4+3] = v3;
1330 }
1331 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1332 {
1333         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1334         command->index = index;
1335         memcpy(command->val, v, sizeof(command->val));
1336
1337         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1338 }
1339
1340 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1341 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1342 {
1343         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1344 }
1345 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1346 {
1347 #ifdef SSE2_PRESENT
1348         int i, index;
1349         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1350         {
1351                 __m128 m0, m1, m2, m3;
1352                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1353                 command->index = index;
1354                 if (((size_t)v)&(ALIGN_SIZE-1))
1355                 {
1356                         m0 = _mm_loadu_ps(v);
1357                         m1 = _mm_loadu_ps(v+4);
1358                         m2 = _mm_loadu_ps(v+8);
1359                         m3 = _mm_loadu_ps(v+12);
1360                 }
1361                 else
1362                 {
1363                         m0 = _mm_load_ps(v);
1364                         m1 = _mm_load_ps(v+4);
1365                         m2 = _mm_load_ps(v+8);
1366                         m3 = _mm_load_ps(v+12);
1367                 }
1368                 if (transpose)
1369                 {
1370                         __m128 t0, t1, t2, t3;
1371                         t0 = _mm_unpacklo_ps(m0, m1);
1372                         t1 = _mm_unpacklo_ps(m2, m3);
1373                         t2 = _mm_unpackhi_ps(m0, m1);
1374                         t3 = _mm_unpackhi_ps(m2, m3);
1375                         m0 = _mm_movelh_ps(t0, t1);
1376                         m1 = _mm_movehl_ps(t1, t0);
1377                         m2 = _mm_movelh_ps(t2, t3);
1378                         m3 = _mm_movehl_ps(t3, t2);                     
1379                 }
1380                 _mm_store_ps(command->val, m0);
1381                 _mm_store_ps(command->val+4, m1);
1382                 _mm_store_ps(command->val+8, m2);
1383                 _mm_store_ps(command->val+12, m3);
1384                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1385                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1386                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1387                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1388         }
1389 #endif
1390 }
1391
1392 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1393 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1394 {
1395         thread->uniform1i[command->index] = command->val;
1396 }
1397 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1398 {
1399         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1400         command->index = index;
1401         command->val = i0;
1402
1403         dpsoftrast.uniform1i[command->index] = i0;
1404 }
1405
1406 #ifdef SSE2_PRESENT
1407 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1408 {
1409         float *end = dst + size*4;
1410         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1411         {
1412                 while (dst < end)
1413                 {
1414                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1415                         dst += 4;
1416                         src += stride;
1417                 }
1418         }
1419         else
1420         {
1421                 while (dst < end)
1422                 {
1423                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1424                         dst += 4;
1425                         src += stride;
1426                 }
1427         }
1428 }
1429
1430 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1431 {
1432         float *end = dst + size*4;
1433         if (stride == sizeof(float[3]))
1434         {
1435                 float *end4 = dst + (size&~3)*4;        
1436                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1437                 {
1438                         while (dst < end4)
1439                         {
1440                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1441                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1442                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1443                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1444                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1445                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1446                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1447                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1448                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1449                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1450                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1451                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1452                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1453                                 dst += 16;
1454                                 src += 4*sizeof(float[3]);
1455                         }
1456                 }
1457                 else
1458                 {
1459                         while (dst < end4)
1460                         {
1461                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1462                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1463                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1464                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1465                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1466                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1469                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1470                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1473                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1474                                 dst += 16;
1475                                 src += 4*sizeof(float[3]);
1476                         }
1477                 }
1478         }
1479         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1480         {
1481                 while (dst < end)
1482                 {
1483                         __m128 v = _mm_loadu_ps((const float *)src);
1484                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1485                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1486                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1487                         _mm_store_ps(dst, v);
1488                         dst += 4;
1489                         src += stride;
1490                 }
1491         }
1492         else
1493         {
1494                 while (dst < end)
1495                 {
1496                         __m128 v = _mm_load_ps((const float *)src);
1497                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1498                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1499                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1500                         _mm_store_ps(dst, v);
1501                         dst += 4;
1502                         src += stride;
1503                 }
1504         }
1505 }
1506
1507 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1508 {
1509         float *end = dst + size*4;
1510         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1511         if (stride == sizeof(float[2]))
1512         {
1513                 float *end2 = dst + (size&~1)*4;
1514                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1515                 {
1516                         while (dst < end2)
1517                         {
1518                                 __m128 v = _mm_loadu_ps((const float *)src);
1519                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1520                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1521                                 dst += 8;
1522                                 src += 2*sizeof(float[2]);
1523                         }
1524                 }
1525                 else
1526                 {
1527                         while (dst < end2)
1528                         {
1529                                 __m128 v = _mm_load_ps((const float *)src);
1530                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1531                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1532                                 dst += 8;
1533                                 src += 2*sizeof(float[2]);
1534                         }
1535                 }
1536         }
1537         while (dst < end)
1538         {
1539                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1540                 dst += 4;
1541                 src += stride;
1542         }
1543 }
1544
1545 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1546 {
1547         float *end = dst + size*4;
1548         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1549         if (stride == sizeof(unsigned char[4]))
1550         {
1551                 float *end4 = dst + (size&~3)*4;
1552                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1553                 {
1554                         while (dst < end4)
1555                         {
1556                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1557                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1558                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1559                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1560                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1561                                 dst += 16;
1562                                 src += 4*sizeof(unsigned char[4]);
1563                         }
1564                 }
1565                 else
1566                 {
1567                         while (dst < end4)
1568                         {
1569                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1570                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1571                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1572                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1574                                 dst += 16;
1575                                 src += 4*sizeof(unsigned char[4]);
1576                         }
1577                 }
1578         }
1579         while (dst < end)
1580         {
1581                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1582                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1583                 dst += 4;
1584                 src += stride;
1585         }
1586 }
1587
// Writes 'size' copies of the 4-float vector at src to consecutive 4-float
// slots starting at dst. src may be unaligned (unaligned load); dst is written
// with aligned 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + 4*i, value);
}
1598 #endif
1599
// Multiplies 'numitems' 4-float vectors from in4f by the 16-float matrix
// inmatrix16f and writes the results to out4f (aligned stores):
//   out = in.x * M[0..3] + in.y * M[4..7] + in.z * M[8..11] + in.w * M[12..15]
// A matrix that is bytewise equal to the identity degenerates to a plain copy
// (skipped entirely when out4f and in4f are the same pointer).
// The function does nothing when SSE2_PRESENT is not defined.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int i;

	// fast case for identity matrix
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}

	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);

	if (!(((size_t)in4f)&(ALIGN_SIZE-1)))
	{
		// input is 16-byte aligned
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		// input is not 16-byte aligned
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1647
// Copies 'numitems' 4-float vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1652
1653 #ifdef SSE2_PRESENT
// Perspective-divides and viewport-maps the vector 'in' = (x, y, z, w):
//   out = (vc[1] + vs[1]*x/w, vc[2] + vs[2]*y/w, vc[3] + vs[3]*z/w, vc[0] + vs[0]/w)
// where vc = viewportcenter and vs = viewportscale, i.e. element 0 of the
// viewport vectors applies to the 1/w term and elements 1..3 to x, y, z.
// 'out' must be an assignable __m128 and may be the same variable as 'in'.
// The temporaries carry a macro-specific prefix so that arguments named p or w
// are not captured by the macro's own locals.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 projectvertex_p = (in), projectvertex_w = _mm_shuffle_ps(projectvertex_p, projectvertex_p, _MM_SHUFFLE(3, 3, 3, 3)); \
	projectvertex_p = _mm_move_ss(_mm_shuffle_ps(projectvertex_p, projectvertex_p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	projectvertex_p = _mm_add_ps((viewportcenter), _mm_div_ps(_mm_mul_ps((viewportscale), projectvertex_p), projectvertex_w)); \
	out = _mm_shuffle_ps(projectvertex_p, projectvertex_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}

// Performs exactly the same computation as DPSOFTRAST_PROJECTVERTEX.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)

// Transforms the vector 'in' = (x, y, z, w) by four matrix vectors:
//   out = x*m0 + y*m1 + z*m2 + w*m3
// 'out' must be an assignable __m128 and may be the same variable as 'in'.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 transformvertex_p = (in); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(0, 0, 0, 0)), (m0)), \
			_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(1, 1, 1, 1)), (m1)), \
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(2, 2, 2, 2)), (m2)), \
					_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(3, 3, 3, 3)), (m3))))); \
}
1678
// Computes a Y range for the box spanned by minpos/maxpos after transforming
// its eight corners by m0..m3, and reports which corners fail the z + w >= 0
// test.
//   *starty receives the truncated viewport-mapped maximum projected Y,
//   *endy   receives the truncated viewport-mapped minimum projected Y plus 1.
//   NOTE(review): starty comes from the maximum and endy from the minimum,
//   which presumably means the viewport Y scale is negative - confirm.
// Edges that connect a failing corner with a passing one contribute the point
// where z + w reaches zero. Projected Y is clamped to [-2, 2] before mapping.
// Returns a bitmask with bit k set for every failing corner k (0xFF when all
// corners fail, 0 when none does).
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	int clipmask = 0xFF;
	int k, axis;

	// swap elements 0 and 1 of the matrix vectors so that the transformed Y
	// lands in element 0, where the scalar (_ss) operations below work
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	// box corners: bit 0 of the index picks max x, bit 1 max y, and bit 2 max z
	// together with max w; a clear bit picks the corresponding min value
	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;

	// transform every corner; those with z + w >= 0 are unmarked in the mask
	// and contribute their y / w directly
	for (k = 0; k < 8; k++)
	{
		__m128 ndc;
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], corner[k], m0, m1, m2, m3);
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		if (!_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
			continue;
		clipmask &= ~(1<<k);
		ndc = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		minproj = _mm_min_ss(minproj, ndc);
		maxproj = _mm_max_ss(maxproj, ndc);
	}

	// for every failing corner, walk its three box edges (the neighbour index
	// differs in exactly one bit) and add the z + w == 0 crossing of each edge
	// that leads to a passing corner
	for (k = 0; k < 8; k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (axis = 1; axis <= 4; axis <<= 1)
		{
			const int other = k^axis;
			__m128 frac, hit;
			if (clipmask&(1<<other))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[other]));
			hit = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[other], bb[k])));
			hit = _mm_div_ss(hit, _mm_shuffle_ps(hit, hit, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, hit);
			maxproj = _mm_max_ss(maxproj, hit);
		}
	}

	// bring element 2 of the viewport vectors into element 0 to match the
	// scalar Y kept in minproj/maxproj
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1749 #endif
1750         
// Copies 'numitems' 4-float vertices from in4f to out4f unchanged and writes
// their projection (DPSOFTRAST_PROJECTVERTEX with dpsoftrast.fb_viewportcenter
// and dpsoftrast.fb_viewportscale) to screen4f. out4f and screen4f are written
// with aligned stores; in4f may be unaligned.
// When both starty and endy are non-NULL, the component-wise min/max of the
// input vertices is passed to DPSOFTRAST_Vertex_BoundY together with an
// identity matrix; its results are stored through starty/endy and its clip
// mask is returned. Otherwise 0 is returned.
// Without SSE2_PRESENT nothing is written and 0 is returned.
// Note: the first input vertex is loaded before the loop, so in4f is read even
// when numitems is 0.
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	__m128 minpos, maxpos;
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
					_mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
#endif
	// kept outside the #ifdef so the function returns a defined value in builds
	// without SSE2_PRESENT as well
	return 0;
}
1798
// Transforms 'numitems' 4-float vertices from in4f by the 16-float matrix
// inmatrix16f (DPSOFTRAST_TRANSFORMVERTEX) into out4f and writes the
// projection of each transformed vertex (DPSOFTRAST_PROJECTVERTEX with
// dpsoftrast.fb_viewportcenter / fb_viewportscale) to screen4f. out4f and
// screen4f are written with aligned stores; in4f may be unaligned.
// A matrix that is bytewise equal to the identity is delegated to
// DPSOFTRAST_Vertex_Project.
// When both starty and endy are non-NULL, the component-wise min/max of the
// untransformed input vertices and the matrix are passed to
// DPSOFTRAST_Vertex_BoundY; its results are stored through starty/endy and its
// clip mask is returned. Otherwise 0 is returned.
// Without SSE2_PRESENT nothing is written and 0 is returned.
// Note: the first input vertex is loaded before the loop, so in4f is read even
// when numitems is 0.
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
		return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
	end = out4f + numitems*4;
	viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
#endif
	// kept outside the #ifdef so the function returns a defined value in builds
	// without SSE2_PRESENT as well
	return 0;
}
1853
1854 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1855 {
1856         float *outf = dpsoftrast.post_array4f[outarray];
1857         const unsigned char *inb;
1858         int firstvertex = dpsoftrast.firstvertex;
1859         int numvertices = dpsoftrast.numvertices;
1860         int stride;
1861         switch(inarray)
1862         {
1863         case DPSOFTRAST_ARRAY_POSITION:
1864                 stride = dpsoftrast.stride_vertex;
1865                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1866                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1867                 break;
1868         case DPSOFTRAST_ARRAY_COLOR:
1869                 stride = dpsoftrast.stride_color;
1870                 if (dpsoftrast.pointer_color4f)
1871                 {
1872                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1873                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1874                 }
1875                 else if (dpsoftrast.pointer_color4ub)
1876                 {
1877                         stride = dpsoftrast.stride_color;
1878                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1879                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1880                 }
1881                 else
1882                 {
1883                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1884                 }
1885                 break;
1886         default:
1887                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1888                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1889                 {
1890                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1891                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1892                         {
1893                         case 2:
1894                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1895                                 break;
1896                         case 3:
1897                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1898                                 break;
1899                         case 4:
1900                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1901                                 break;
1902                         }
1903                 }
1904                 break;
1905         }
1906         return outf;
1907 }
1908
1909 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1910 {
1911         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1912         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1913         return data;
1914 }
1915
1916 #if 0
1917 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1918 {
1919         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1920         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
1921         return data;
1922 }
1923 #endif
1924
1925 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1926 {
1927         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1928         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1929         return data;
1930 }
1931
1932 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1933 {
1934         int x;
1935         int startx = span->startx;
1936         int endx = span->endx;
1937         float wslope = triangle->w[0];
1938         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1939         float endz = 1.0f / (w + wslope * startx);
1940         for (x = startx;x < endx;)
1941         {
1942                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1943                 float z = endz, dz;
1944                 if(nextsub >= endx) nextsub = endsub = endx-1;
1945                 endz = 1.0f / (w + wslope * nextsub);
1946                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1947                 for (; x <= endsub; x++, z += dz)
1948                         zf[x] = z;
1949         }
1950 }
1951
1952 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1953 {
1954         int x;
1955         int startx = span->startx;
1956         int endx = span->endx;
1957         int d[4];
1958         float a, b;
1959         unsigned char * RESTRICT pixelmask = span->pixelmask;
1960         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1961         if (!pixel)
1962                 return;
1963         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1964         // handle alphatest now (this affects depth writes too)
1965         if (thread->alphatest)
1966                 for (x = startx;x < endx;x++)
1967                         if (in4f[x*4+3] < 0.5f)
1968                                 pixelmask[x] = false;
1969         // FIXME: this does not handle bigendian
1970         switch(thread->fb_blendmode)
1971         {
1972         case DPSOFTRAST_BLENDMODE_OPAQUE:
1973                 for (x = startx;x < endx;x++)
1974                 {
1975                         if (!pixelmask[x])
1976                                 continue;
1977                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1978                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1979                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1980                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1981                         pixel[x*4+0] = d[0];
1982                         pixel[x*4+1] = d[1];
1983                         pixel[x*4+2] = d[2];
1984                         pixel[x*4+3] = d[3];
1985                 }
1986                 break;
1987         case DPSOFTRAST_BLENDMODE_ALPHA:
1988                 for (x = startx;x < endx;x++)
1989                 {
1990                         if (!pixelmask[x])
1991                                 continue;
1992                         a = in4f[x*4+3] * 255.0f;
1993                         b = 1.0f - in4f[x*4+3];
1994                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1995                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1996                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1997                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1998                         pixel[x*4+0] = d[0];
1999                         pixel[x*4+1] = d[1];
2000                         pixel[x*4+2] = d[2];
2001                         pixel[x*4+3] = d[3];
2002                 }
2003                 break;
2004         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2005                 for (x = startx;x < endx;x++)
2006                 {
2007                         if (!pixelmask[x])
2008                                 continue;
2009                         a = in4f[x*4+3] * 255.0f;
2010                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2011                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2012                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2013                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2014                         pixel[x*4+0] = d[0];
2015                         pixel[x*4+1] = d[1];
2016                         pixel[x*4+2] = d[2];
2017                         pixel[x*4+3] = d[3];
2018                 }
2019                 break;
2020         case DPSOFTRAST_BLENDMODE_ADD:
2021                 for (x = startx;x < endx;x++)
2022                 {
2023                         if (!pixelmask[x])
2024                                 continue;
2025                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_INVMOD:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044                         pixel[x*4+0] = d[0];
2045                         pixel[x*4+1] = d[1];
2046                         pixel[x*4+2] = d[2];
2047                         pixel[x*4+3] = d[3];
2048                 }
2049                 break;
2050         case DPSOFTRAST_BLENDMODE_MUL:
2051                 for (x = startx;x < endx;x++)
2052                 {
2053                         if (!pixelmask[x])
2054                                 continue;
2055                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059                         pixel[x*4+0] = d[0];
2060                         pixel[x*4+1] = d[1];
2061                         pixel[x*4+2] = d[2];
2062                         pixel[x*4+3] = d[3];
2063                 }
2064                 break;
2065         case DPSOFTRAST_BLENDMODE_MUL2:
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (!pixelmask[x])
2069                                 continue;
2070                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2071                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2072                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2073                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2074                         pixel[x*4+0] = d[0];
2075                         pixel[x*4+1] = d[1];
2076                         pixel[x*4+2] = d[2];
2077                         pixel[x*4+3] = d[3];
2078                 }
2079                 break;
2080         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2081                 for (x = startx;x < endx;x++)
2082                 {
2083                         if (!pixelmask[x])
2084                                 continue;
2085                         a = in4f[x*4+3] * -255.0f;
2086                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2087                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2088                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2089                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2090                         pixel[x*4+0] = d[0];
2091                         pixel[x*4+1] = d[1];
2092                         pixel[x*4+2] = d[2];
2093                         pixel[x*4+3] = d[3];
2094                 }
2095                 break;
2096         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2097                 for (x = startx;x < endx;x++)
2098                 {
2099                         if (!pixelmask[x])
2100                                 continue;
2101                         a = 255.0f;
2102                         b = 1.0f - in4f[x*4+3];
2103                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2104                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2105                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2106                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2107                         pixel[x*4+0] = d[0];
2108                         pixel[x*4+1] = d[1];
2109                         pixel[x*4+2] = d[2];
2110                         pixel[x*4+3] = d[3];
2111                 }
2112                 break;
2113         }
2114 }
2115
2116 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2117 {
2118 #ifdef SSE2_PRESENT
2119         int x;
2120         int startx = span->startx;
2121         int endx = span->endx;
2122         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2123         unsigned char * RESTRICT pixelmask = span->pixelmask;
2124         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2125         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2126         if (!pixel)
2127                 return;
2128         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2129         pixeli += span->y * dpsoftrast.fb_width + span->x;
2130         // handle alphatest now (this affects depth writes too)
2131         if (thread->alphatest)
2132                 for (x = startx;x < endx;x++)
2133                         if (in4ub[x*4+3] < 0.5f)
2134                                 pixelmask[x] = false;
2135         // FIXME: this does not handle bigendian
2136         switch(thread->fb_blendmode)
2137         {
2138         case DPSOFTRAST_BLENDMODE_OPAQUE:
2139                 for (x = startx;x + 4 <= endx;)
2140                 {
2141                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2142                         {
2143                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2144                                 x += 4;
2145                         }
2146                         else
2147                         {
2148                                 if (pixelmask[x])
2149                                         pixeli[x] = ini[x];
2150                                 x++;
2151                         }
2152                 }
2153                 for (;x < endx;x++)
2154                         if (pixelmask[x])
2155                                 pixeli[x] = ini[x];
2156                 break;
2157         case DPSOFTRAST_BLENDMODE_ALPHA:
2158         #define FINISHBLEND(blend2, blend1) \
2159                 for (x = startx;x + 2 <= endx;x += 2) \
2160                 { \
2161                         __m128i src, dst; \
2162                         switch (*(const unsigned short*)&pixelmask[x]) \
2163                         { \
2164                         case 0x0101: \
2165                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2166                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2167                                 blend2; \
2168                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2169                                 continue; \
2170                         case 0x0100: \
2171                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2172                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2173                                 blend1; \
2174                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2175                                 continue; \
2176                         case 0x0001: \
2177                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2178                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2179                                 blend1; \
2180                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2181                                 continue; \
2182                         } \
2183                         break; \
2184                 } \
2185                 for(;x < endx; x++) \
2186                 { \
2187                         __m128i src, dst; \
2188                         if (!pixelmask[x]) \
2189                                 continue; \
2190                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2191                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2192                         blend1; \
2193                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2194                 }
2195
2196                 FINISHBLEND({
2197                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2198                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2199                 }, {
2200                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2201                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2202                 });
2203                 break;
2204         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2205                 FINISHBLEND({
2206                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2207                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2208                 }, {
2209                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2210                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2211                 });
2212                 break;
2213         case DPSOFTRAST_BLENDMODE_ADD:
2214                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2215                 break;
2216         case DPSOFTRAST_BLENDMODE_INVMOD:
2217                 FINISHBLEND({
2218                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2219                 }, {
2220                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2221                 });
2222                 break;
2223         case DPSOFTRAST_BLENDMODE_MUL:
2224                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2225                 break;
2226         case DPSOFTRAST_BLENDMODE_MUL2:
2227                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2228                 break;
2229         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2230                 FINISHBLEND({
2231                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2232                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2233                 }, {
2234                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2235                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                 });
2237                 break;
2238         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2239                 FINISHBLEND({
2240                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2241                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2242                 }, {
2243                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2244                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2245                 });
2246                 break;
2247         }
2248 #endif
2249 }
2250
2251 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2252 {
2253         int x;
2254         int startx = span->startx;
2255         int endx = span->endx;
2256         int flags;
2257         float c[4];
2258         float data[4];
2259         float slope[4];
2260         float tc[2], endtc[2];
2261         float tcscale[2];
2262         unsigned int tci[2];
2263         unsigned int tci1[2];
2264         unsigned int tcimin[2];
2265         unsigned int tcimax[2];
2266         int tciwrapmask[2];
2267         int tciwidth;
2268         int filter;
2269         int mip;
2270         const unsigned char * RESTRICT pixelbase;
2271         const unsigned char * RESTRICT pixel[4];
2272         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2273         // if no texture is bound, just fill it with white
2274         if (!texture)
2275         {
2276                 for (x = startx;x < endx;x++)
2277                 {
2278                         out4f[x*4+0] = 1.0f;
2279                         out4f[x*4+1] = 1.0f;
2280                         out4f[x*4+2] = 1.0f;
2281                         out4f[x*4+3] = 1.0f;
2282                 }
2283                 return;
2284         }
2285         mip = triangle->mip[texunitindex];
2286         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2287         // if this mipmap of the texture is 1 pixel, just fill it with that color
2288         if (texture->mipmap[mip][1] == 4)
2289         {
2290                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2291                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2292                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2293                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2294                 for (x = startx;x < endx;x++)
2295                 {
2296                         out4f[x*4+0] = c[0];
2297                         out4f[x*4+1] = c[1];
2298                         out4f[x*4+2] = c[2];
2299                         out4f[x*4+3] = c[3];
2300                 }
2301                 return;
2302         }
2303         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2304         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2305         flags = texture->flags;
2306         tcscale[0] = texture->mipmap[mip][2];
2307         tcscale[1] = texture->mipmap[mip][3];
2308         tciwidth = texture->mipmap[mip][2];
2309         tcimin[0] = 0;
2310         tcimin[1] = 0;
2311         tcimax[0] = texture->mipmap[mip][2]-1;
2312         tcimax[1] = texture->mipmap[mip][3]-1;
2313         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2314         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2315         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2316         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2317         for (x = startx;x < endx;)
2318         {
2319                 unsigned int subtc[2];
2320                 unsigned int substep[2];
2321                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2322                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2323                 if(nextsub >= endx)
2324                 {
2325                         nextsub = endsub = endx-1;      
2326                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2327                 }
2328                 tc[0] = endtc[0];
2329                 tc[1] = endtc[1];
2330                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2331                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2332                 substep[0] = (endtc[0] - tc[0]) * subscale;
2333                 substep[1] = (endtc[1] - tc[1]) * subscale;
2334                 subtc[0] = tc[0] * (1<<16);
2335                 subtc[1] = tc[1] * (1<<16);
2336                 if(filter)
2337                 {
2338                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2339                         {
2340                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2341                                 {
2342                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2343                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2344                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2345                                         tci[0] = subtc[0]>>16;
2346                                         tci[1] = subtc[1]>>16;
2347                                         tci1[0] = tci[0] + 1;
2348                                         tci1[1] = tci[1] + 1;
2349                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2350                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2351                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2352                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2353                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2354                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2355                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2356                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2357                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2358                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2359                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2360                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2361                                         out4f[x*4+0] = c[0];
2362                                         out4f[x*4+1] = c[1];
2363                                         out4f[x*4+2] = c[2];
2364                                         out4f[x*4+3] = c[3];
2365                                 }
2366                         }
2367                         else
2368                         {
2369                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2370                                 {
2371                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2372                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2373                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2374                                         tci[0] = subtc[0]>>16;
2375                                         tci[1] = subtc[1]>>16;
2376                                         tci1[0] = tci[0] + 1;
2377                                         tci1[1] = tci[1] + 1;
2378                                         tci[0] &= tciwrapmask[0];
2379                                         tci[1] &= tciwrapmask[1];
2380                                         tci1[0] &= tciwrapmask[0];
2381                                         tci1[1] &= tciwrapmask[1];
2382                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2383                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2384                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2385                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2386                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2387                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2388                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2389                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2390                                         out4f[x*4+0] = c[0];
2391                                         out4f[x*4+1] = c[1];
2392                                         out4f[x*4+2] = c[2];
2393                                         out4f[x*4+3] = c[3];
2394                                 }
2395                         }
2396                 }
2397                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2398                 {
2399                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2400                         {
2401                                 tci[0] = subtc[0]>>16;
2402                                 tci[1] = subtc[1]>>16;
2403                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2404                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2405                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2406                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2407                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2408                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2409                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2410                                 out4f[x*4+0] = c[0];
2411                                 out4f[x*4+1] = c[1];
2412                                 out4f[x*4+2] = c[2];
2413                                 out4f[x*4+3] = c[3];
2414                         }
2415                 }
2416                 else
2417                 {
2418                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2419                         {
2420                                 tci[0] = subtc[0]>>16;
2421                                 tci[1] = subtc[1]>>16;
2422                                 tci[0] &= tciwrapmask[0];
2423                                 tci[1] &= tciwrapmask[1];
2424                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2425                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2426                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2427                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2428                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2429                                 out4f[x*4+0] = c[0];
2430                                 out4f[x*4+1] = c[1];
2431                                 out4f[x*4+2] = c[2];
2432                                 out4f[x*4+3] = c[3];
2433                         }
2434                 }
2435         }
2436 }
2437
2438 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2439 {
2440 #ifdef SSE2_PRESENT
2441         int x;
2442         int startx = span->startx;
2443         int endx = span->endx;
2444         int flags;
2445         __m128 data, slope, tcscale;
2446         __m128i tcsize, tcmask, tcoffset, tcmax;
2447         __m128 tc, endtc;
2448         __m128i subtc, substep, endsubtc;
2449         int filter;
2450         int mip;
2451         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2452         const unsigned char * RESTRICT pixelbase;
2453         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2454         // if no texture is bound, just fill it with white
2455         if (!texture)
2456         {
2457                 memset(out4ub + startx*4, 255, span->length*4);
2458                 return;
2459         }
2460         mip = triangle->mip[texunitindex];
2461         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2462         // if this mipmap of the texture is 1 pixel, just fill it with that color
2463         if (texture->mipmap[mip][1] == 4)
2464         {
2465                 unsigned int k = *((const unsigned int *)pixelbase);
2466                 for (x = startx;x < endx;x++)
2467                         outi[x] = k;
2468                 return;
2469         }
2470         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2471         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2472         flags = texture->flags;
2473         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2474         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2475         tcscale = _mm_cvtepi32_ps(tcsize);
2476         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2477         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2478         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2479         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2480         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2481         tcmax = _mm_packs_epi32(tcmask, tcmask);
2482         for (x = startx;x < endx;)
2483         {
2484                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2485                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2486                 if(nextsub >= endx)
2487                 {
2488                         nextsub = endsub = endx-1;
2489                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2490                 }       
2491                 tc = endtc;
2492                 subtc = endsubtc;
2493                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2494                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2495                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2496                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2497                 substep = _mm_slli_epi32(substep, 1);
2498                 if (filter)
2499                 {
2500                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2501                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2502                         {
2503                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2504                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2505                                 {
2506                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2507                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2508                                         tci = _mm_madd_epi16(tci, tcoffset);
2509                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2510                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2511                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2512                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2513                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2514                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2515                                         fracm = _mm_srli_epi16(subtc, 1);
2516                                         pix1 = _mm_add_epi16(pix1,
2517                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2518                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2519                                         pix3 = _mm_add_epi16(pix3,
2520                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2521                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2522                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2523                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2524                                         pix2 = _mm_add_epi16(pix2,
2525                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2526                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2527                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2528                                 }
2529                                 if (x <= endsub)
2530                                 {
2531                                         const unsigned char * RESTRICT ptr1;
2532                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2533                                         tci = _mm_madd_epi16(tci, tcoffset);
2534                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2535                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2536                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2537                                         fracm = _mm_srli_epi16(subtc, 1);
2538                                         pix1 = _mm_add_epi16(pix1,
2539                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2540                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2541                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2542                                         pix1 = _mm_add_epi16(pix1,
2543                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2544                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2545                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2546                                         x++;
2547                                 }
2548                         }
2549                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2550                         {
2551                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2552                                 {
2553                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2554                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2555                                         tci = _mm_madd_epi16(tci, tcoffset);
2556                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2557                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2558                                                                                         _mm_setzero_si128());
2559                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2560                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2561                                                                                         _mm_setzero_si128());
2562                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2563                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2564                                         tci = _mm_madd_epi16(tci, tcoffset);
2565                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2566                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2567                                                                                         _mm_setzero_si128());
2568                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2569                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2570                                                                                         _mm_setzero_si128());
2571                                         fracm = _mm_srli_epi16(subtc, 1);
2572                                         pix1 = _mm_add_epi16(pix1,
2573                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2574                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2575                                         pix3 = _mm_add_epi16(pix3,
2576                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2577                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2578                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2579                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2580                                         pix2 = _mm_add_epi16(pix2,
2581                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2582                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2583                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2584                                 }
2585                                 if (x <= endsub)
2586                                 {
2587                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2588                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2589                                         tci = _mm_madd_epi16(tci, tcoffset);
2590                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2591                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2592                                                                                         _mm_setzero_si128());
2593                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2594                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2595                                                                                         _mm_setzero_si128());
2596                                         fracm = _mm_srli_epi16(subtc, 1);
2597                                         pix1 = _mm_add_epi16(pix1,
2598                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2599                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2600                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2601                                         pix1 = _mm_add_epi16(pix1,
2602                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2603                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2604                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2605                                         x++;
2606                                 }
2607                         }
2608                         else
2609                         {
2610                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2611                                 {
2612                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2613                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2614                                         tci = _mm_madd_epi16(tci, tcoffset);
2615                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2616                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2617                                                                                         _mm_setzero_si128());
2618                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2619                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2620                                                                                         _mm_setzero_si128());
2621                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2622                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2623                                         tci = _mm_madd_epi16(tci, tcoffset);
2624                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2625                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2626                                                                                         _mm_setzero_si128());
2627                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2628                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2629                                                                                         _mm_setzero_si128());
2630                                         fracm = _mm_srli_epi16(subtc, 1);
2631                                         pix1 = _mm_add_epi16(pix1,
2632                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2633                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2634                                         pix3 = _mm_add_epi16(pix3,
2635                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2636                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2637                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2638                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2639                                         pix2 = _mm_add_epi16(pix2,
2640                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2641                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2642                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2643                                 }
2644                                 if (x <= endsub)
2645                                 {
2646                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2647                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2648                                         tci = _mm_madd_epi16(tci, tcoffset);
2649                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2650                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2651                                                                                         _mm_setzero_si128());
2652                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2653                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2654                                                                                         _mm_setzero_si128());
2655                                         fracm = _mm_srli_epi16(subtc, 1);
2656                                         pix1 = _mm_add_epi16(pix1,
2657                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2659                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2660                                         pix1 = _mm_add_epi16(pix1,
2661                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2663                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2664                                         x++;
2665                                 }
2666                         }
2667                 }
2668                 else
2669                 {
2670                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2671                         {
2672                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2673                                 {
2674                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2675                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2676                                         tci = _mm_madd_epi16(tci, tcoffset);
2677                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2678                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2679                                 }
2680                                 if (x <= endsub)
2681                                 {
2682                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2683                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2684                                         tci = _mm_madd_epi16(tci, tcoffset);
2685                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2686                                         x++;
2687                                 }
2688                         }
2689                         else
2690                         {
2691                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2692                                 {
2693                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2694                                         tci = _mm_and_si128(tci, tcmax); 
2695                                         tci = _mm_madd_epi16(tci, tcoffset);
2696                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2697                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2698                                 }
2699                                 if (x <= endsub)
2700                                 {
2701                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2702                                         tci = _mm_and_si128(tci, tcmax); 
2703                                         tci = _mm_madd_epi16(tci, tcoffset);
2704                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2705                                         x++;
2706                                 }
2707                         }
2708                 }
2709         }
2710 #endif
2711 }
2712
2713 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2714 {
2715         // TODO: IMPLEMENT
2716         memset(out4ub, 255, span->length*4);
2717 }
2718
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	// stub: the lookup vector is ignored and 1.0f is returned for every input
	const float result = 1.0f;
	(void)vector;
	return result;
}
2724
2725 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2726 {
2727         int x;
2728         int startx = span->startx;
2729         int endx = span->endx;
2730         float c[4];
2731         float data[4];
2732         float slope[4];
2733         float z;
2734         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2735         for (x = startx;x < endx;x++)
2736         {
2737                 z = zf[x];
2738                 c[0] = (data[0] + slope[0]*x) * z;
2739                 c[1] = (data[1] + slope[1]*x) * z;
2740                 c[2] = (data[2] + slope[2]*x) * z;
2741                 c[3] = (data[3] + slope[3]*x) * z;
2742                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2743                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2744                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2745                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2746         }
2747 }
2748
2749 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2750 {
2751         int x;
2752         int startx = span->startx;
2753         int endx = span->endx;
2754         float c[4];
2755         float data[4];
2756         float slope[4];
2757         float z;
2758         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2759         for (x = startx;x < endx;x++)
2760         {
2761                 z = zf[x];
2762                 c[0] = (data[0] + slope[0]*x) * z;
2763                 c[1] = (data[1] + slope[1]*x) * z;
2764                 c[2] = (data[2] + slope[2]*x) * z;
2765                 c[3] = (data[3] + slope[3]*x) * z;
2766                 out4f[x*4+0] = c[0];
2767                 out4f[x*4+1] = c[1];
2768                 out4f[x*4+2] = c[2];
2769                 out4f[x*4+3] = c[3];
2770         }
2771 }
2772
2773 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2774 {
2775         int x, startx = span->startx, endx = span->endx;
2776         float c[4], localcolor[4];
2777         localcolor[0] = subcolor[0];
2778         localcolor[1] = subcolor[1];
2779         localcolor[2] = subcolor[2];
2780         localcolor[3] = subcolor[3];
2781         for (x = startx;x < endx;x++)
2782         {
2783                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2784                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2785                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2786                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2787                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2788                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2789                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2790                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2791         }
2792 }
2793
2794 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2795 {
2796         int x, startx = span->startx, endx = span->endx;
2797         for (x = startx;x < endx;x++)
2798         {
2799                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2800                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2801                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2802                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2803         }
2804 }
2805
2806 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2807 {
2808         int x, startx = span->startx, endx = span->endx;
2809         for (x = startx;x < endx;x++)
2810         {
2811                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2812                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2813                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2814                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2815         }
2816 }
2817
2818 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2819 {
2820         int x, startx = span->startx, endx = span->endx;
2821         float a, b;
2822         for (x = startx;x < endx;x++)
2823         {
2824                 a = 1.0f - inb4f[x*4+3];
2825                 b = inb4f[x*4+3];
2826                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2827                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2828                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2829                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2830         }
2831 }
2832
2833 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2834 {
2835         int x, startx = span->startx, endx = span->endx;
2836         float localcolor[4], ilerp, lerp;
2837         localcolor[0] = color[0];
2838         localcolor[1] = color[1];
2839         localcolor[2] = color[2];
2840         localcolor[3] = color[3];
2841         ilerp = 1.0f - localcolor[3];
2842         lerp = localcolor[3];
2843         for (x = startx;x < endx;x++)
2844         {
2845                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2846                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2847                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2848                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2849         }
2850 }
2851
2852
2853
2854 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2855 {
2856 #ifdef SSE2_PRESENT
2857         int x;
2858         int startx = span->startx;
2859         int endx = span->endx;
2860         __m128 data, slope;
2861         __m128 mod, endmod;
2862         __m128i submod, substep, endsubmod;
2863         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2864         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2865         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2866         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2867         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2868         for (x = startx; x < endx;)
2869         {
2870                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2871                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2872                 if(nextsub >= endx)
2873                 {
2874                         nextsub = endsub = endx-1;
2875                         if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2876                 }
2877                 mod = endmod;
2878                 submod = endsubmod;
2879                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2880                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2881                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2882                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2883                 substep = _mm_packs_epi32(substep, substep);
2884                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2885                 {
2886                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2887                         pix = _mm_mulhi_epu16(pix, submod);
2888                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2889                 }
2890                 if (x <= endsub)
2891                 {
2892                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2893                         pix = _mm_mulhi_epu16(pix, submod);
2894                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2895                         x++;
2896                 }
2897         }
2898 #endif
2899 }
2900
2901 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2902 {
2903 #ifdef SSE2_PRESENT
2904         int x;
2905         int startx = span->startx;
2906         int endx = span->endx;
2907         __m128 data, slope;
2908         __m128 mod, endmod;
2909         __m128i submod, substep, endsubmod;
2910         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2911         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2912         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2913         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2914         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2915         for (x = startx; x < endx;)
2916         {
2917                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2918                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2919                 if(nextsub >= endx)
2920                 {
2921                         nextsub = endsub = endx-1;
2922                         if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2923                 }
2924                 mod = endmod;
2925                 submod = endsubmod;
2926                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2927                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2928                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2929                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2930                 substep = _mm_packs_epi32(substep, substep);
2931                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2932                 {
2933                         __m128i pix = _mm_srai_epi16(submod, 4);
2934                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2935                 }
2936                 if (x <= endsub)
2937                 {
2938                         __m128i pix = _mm_srai_epi16(submod, 4);
2939                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2940                         x++;
2941                 }
2942         }
2943 #endif
2944 }
2945
2946 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2947 {
2948 #ifdef SSE2_PRESENT
2949         int x, startx = span->startx, endx = span->endx;
2950         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2951         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2952         for (x = startx;x+2 <= endx;x+=2)
2953         {
2954                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2955                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2956                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2957                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2958         }
2959         if(x < endx)
2960         {
2961                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2962                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2963                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2964                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2965         }
2966 #endif
2967 }
2968
2969 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2970 {
2971 #ifdef SSE2_PRESENT
2972         int x, startx = span->startx, endx = span->endx;
2973         for (x = startx;x+2 <= endx;x+=2)
2974         {
2975                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2976                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2977                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2978                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2979         }
2980         if(x < endx)
2981         {
2982                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2983                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2984                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2985                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2986         }
2987 #endif
2988 }
2989
2990 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2991 {
2992 #ifdef SSE2_PRESENT
2993         int x, startx = span->startx, endx = span->endx;
2994         for (x = startx;x+2 <= endx;x+=2)
2995         {
2996                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2997                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2998                 pix1 = _mm_add_epi16(pix1, pix2);
2999                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3000         }
3001         if(x < endx)
3002         {
3003                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3004                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3005                 pix1 = _mm_add_epi16(pix1, pix2);
3006                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3007         }
3008 #endif
3009 }
3010
3011 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3012 {
3013 #ifdef SSE2_PRESENT
3014         int x, startx = span->startx, endx = span->endx;
3015         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3016         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3017         for (x = startx;x+2 <= endx;x+=2)
3018         {
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3021                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3022                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3023         }
3024         if(x < endx)
3025         {
3026                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3028                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3029                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3030         }
3031 #endif
3032 }
3033
3034 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3035 {
3036 #ifdef SSE2_PRESENT
3037         int x, startx = span->startx, endx = span->endx;
3038         for (x = startx;x+2 <= endx;x+=2)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3042                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3043                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3044                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3045         }
3046         if(x < endx)
3047         {
3048                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3051                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3052                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3053         }
3054 #endif
3055 }
3056
3057 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3058 {
3059 #ifdef SSE2_PRESENT
3060         int x, startx = span->startx, endx = span->endx;
3061         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3062         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3063         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3064         for (x = startx;x+2 <= endx;x+=2)
3065         {
3066                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3067                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3068                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3069         }
3070         if(x < endx)
3071         {
3072                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3073                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3074                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3075         }
3076 #endif
3077 }
3078
3079
3080
3081 void DPSOFTRAST_VertexShader_Generic(void)
3082 {
3083         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3084         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3085         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3086         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3087                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3088 }
3089
3090 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3091 {
3092         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3093         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3094         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3095         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3096         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3097         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3098         {
3099                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3100                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3101                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3102                 {
3103                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3104                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3105                         {
3106                                 // multiply
3107                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3108                         }
3109                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3110                         {
3111                                 // add
3112                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3113                         }
3114                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3115                         {
3116                                 // alphablend
3117                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3118                         }
3119                 }
3120         }
3121         else
3122                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3123         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3124 }
3125
3126
3127
3128 void DPSOFTRAST_VertexShader_PostProcess(void)
3129 {
3130         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3131         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3132         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3133 }
3134
3135 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3136 {
3137         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3138         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3139         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3143         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3144         {
3145                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3146                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3147         }
3148         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3149         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3150         {
3151                 // TODO: implement saturation
3152         }
3153         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3154         {
3155                 // TODO: implement gammaramps
3156         }
3157         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3158 }
3159
3160
3161
3162 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3163 {
3164         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3165 }
3166
3167 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3168 {
3169         // this is never called (because colormask is off when this shader is used)
3170         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3171         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3172         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3173         memset(buffer_FragColorbgra8, 0, span->length*4);
3174         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3175 }
3176
3177
3178
3179 void DPSOFTRAST_VertexShader_FlatColor(void)
3180 {
3181         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3182         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3183 }
3184
3185 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3186 {
3187         int x, startx = span->startx, endx = span->endx;
3188         int Color_Ambienti[4];
3189         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3190         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3191         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3192         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3193         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3194         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3195         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3196         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3197         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3198         for (x = startx;x < endx;x++)
3199         {
3200                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3201                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3202                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3203                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3204         }
3205         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3206 }
3207
3208
3209
3210 void DPSOFTRAST_VertexShader_VertexColor(void)
3211 {
3212         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3213         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3214         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3215 }
3216
3217 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3218 {
3219 #ifdef SSE2_PRESENT
3220         unsigned char * RESTRICT pixelmask = span->pixelmask;
3221         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3222         int x, startx = span->startx, endx = span->endx;
3223         __m128i Color_Ambientm, Color_Diffusem;
3224         __m128 data, slope;
3225         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3226         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3229         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3230         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3231         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3232                 pixel = buffer_FragColorbgra8;
3233         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3234         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3235         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3236         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3237         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3238         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3239         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3240         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3241         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3242         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3243         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3244         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3245         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3246         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3247         {
3248                 __m128i color, mod, pix;
3249                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3250                 {
3251                         __m128i pix2, mod2;
3252                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3253                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3254                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3255                         data = _mm_add_ps(data, slope);
3256                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3257                         data = _mm_add_ps(data, slope);
3258                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3259                         data = _mm_add_ps(data, slope);
3260                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3261                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3262                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3263                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3264                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3265                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3266                         x += 3;
3267                         continue;
3268                 }
3269                 if(!pixelmask[x])
3270                         continue;
3271                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3272                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3273                 mod = _mm_packs_epi32(mod, mod);
3274                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3275                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3276         }
3277         if(pixel == buffer_FragColorbgra8)
3278                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3279 #endif
3280 }
3281
3282
3283
3284 void DPSOFTRAST_VertexShader_Lightmap(void)
3285 {
3286         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3287         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3288         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3289 }
3290
3291 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3292 {
3293 #ifdef SSE2_PRESENT
3294         unsigned char * RESTRICT pixelmask = span->pixelmask;
3295         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3296         int x, startx = span->startx, endx = span->endx;
3297         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3298         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3299         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3300         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3302         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3303         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3304         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3305         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3306         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3307                 pixel = buffer_FragColorbgra8;
3308         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3309         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3310         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3311         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3312         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3313         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3314         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3315         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3316         {
3317                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3318                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3319                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3320                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3321                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3322                 for (x = startx;x < endx;x++)
3323                 {
3324                         __m128i color, lightmap, glow, pix;
3325                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3326                         {
3327                                 __m128i pix2;
3328                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3329                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3330                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3331                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3332                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3333                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3334                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3335                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3336                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3337                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3338                                 x += 3;
3339                                 continue;
3340                         }
3341                         if(!pixelmask[x])
3342                                 continue;
3343                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3344                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3345                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3346                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3347                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3348                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3349                 }
3350         }
3351         else
3352         {
3353                 for (x = startx;x < endx;x++)
3354                 {
3355                         __m128i color, lightmap, pix;
3356                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3357                         {
3358                                 __m128i pix2;
3359                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3360                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3361                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3362                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3363                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3364                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3365                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3366                                 x += 3;
3367                                 continue;
3368                         }
3369                         if(!pixelmask[x]) 
3370                                 continue;
3371                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3372                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3373                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3374                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3375                 }
3376         }
3377         if(pixel == buffer_FragColorbgra8)
3378                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3379 #endif
3380 }
3381
3382
3383
3384 void DPSOFTRAST_VertexShader_FakeLight(void)
3385 {
3386         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3387 }
3388
3389 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3390 {
3391         // TODO: IMPLEMENT
3392         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3393         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3394         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3395         memset(buffer_FragColorbgra8, 0, span->length*4);
3396         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3397 }
3398
3399
3400
// LightDirectionMap (model space) vertex stage.
// There is no dedicated implementation: the call is forwarded unchanged to the
// Lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3405
3406 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3407 {
3408         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3409         // TODO: IMPLEMENT
3410 }
3411
3412
3413
// LightDirectionMap (tangent space) vertex stage.
// There is no dedicated implementation: the call is forwarded unchanged to the
// Lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3418
3419 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3420 {
3421         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3422         // TODO: IMPLEMENT
3423 }
3424
3425
3426
3427 void DPSOFTRAST_VertexShader_LightDirection(void)
3428 {
3429         int i;
3430         int numvertices = dpsoftrast.numvertices;
3431         float LightDir[4];
3432         float LightVector[4];
3433         float EyePosition[4];
3434         float EyeVectorModelSpace[4];
3435         float EyeVector[4];
3436         float position[4];
3437         float svector[4];
3438         float tvector[4];
3439         float normal[4];
3440         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3441         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3442         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3443         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3444         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3445         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3446         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3447         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3448         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3449         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3450         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3451         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3452         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3453         for (i = 0;i < numvertices;i++)
3454         {
3455                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3456                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3457                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3458                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3459                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3460                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3461                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3462                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3463                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3464                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3465                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3466                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3467                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3468                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3469                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3470                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3471                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3472                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3473                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3474                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3475                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3476                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3477                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3478                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3479                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3480                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3481                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3482                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3483                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3484         }
3485         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3486 }
3487
3488 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3489 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3490 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3491 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3492 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3493 #define DPSOFTRAST_Vector3Normalize(v)\
3494 do\
3495 {\
3496         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3497         if (len)\
3498         {\
3499                 len = 1.0f / len;\
3500                 v[0] *= len;\
3501                 v[1] *= len;\
3502                 v[2] *= len;\
3503         }\
3504 }\
3505 while(0)
3506
3507 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3508 {
3509         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3510         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3511         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3514         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3515         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3516         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3517         int x, startx = span->startx, endx = span->endx;
3518         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3519         float LightVectordata[4];
3520         float LightVectorslope[4];
3521         float EyeVectordata[4];
3522         float EyeVectorslope[4];
3523         float z;
3524         float diffusetex[4];
3525         float glosstex[4];
3526         float surfacenormal[4];
3527         float lightnormal[4];
3528         float eyenormal[4];
3529         float specularnormal[4];
3530         float diffuse;
3531         float specular;
3532         float SpecularPower;
3533         int d[4];
3534         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3535         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3536         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3537         Color_Glow[3] = 0.0f;
3538         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3539         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3540         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3541         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3542         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3543         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3544         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3545         Color_Pants[3] = 0.0f;
3546         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3547         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3548         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3549         Color_Shirt[3] = 0.0f;
3550         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3551         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3552         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3553         {
3554                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3555                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3556         }
3557         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3558         {
3559                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3560         }
3561         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3562         {
3563                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3564                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3565                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3566                 Color_Diffuse[3] = 0.0f;
3567                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3568                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3569                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3570                 LightColor[3] = 0.0f;
3571                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3572                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3573                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3574                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3575                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3576                 Color_Specular[3] = 0.0f;
3577                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3578                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3579                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3580                 for (x = startx;x < endx;x++)
3581                 {
3582                         z = buffer_z[x];
3583                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3584                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3585                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3586                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3587                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3588                         {
3589                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3590                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3591                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3592                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3593                         }
3594                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3595                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3596                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3597                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3598                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3599                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3600                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3601                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3602
3603                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3604                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3605                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3606                         DPSOFTRAST_Vector3Normalize(lightnormal);
3607
3608                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3609                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3610                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3611                         DPSOFTRAST_Vector3Normalize(eyenormal);
3612
3613                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3614                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3615                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3616                         DPSOFTRAST_Vector3Normalize(specularnormal);
3617
3618                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3619                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3620                         specular = pow(specular, SpecularPower * glosstex[3]);
3621                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3622                         {
3623                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3624                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3625                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3626                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3627                         }
3628                         else
3629                         {
3630                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3631                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3632                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3633                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3634                         }
3635                         buffer_FragColorbgra8[x*4+0] = d[0];
3636                         buffer_FragColorbgra8[x*4+1] = d[1];
3637                         buffer_FragColorbgra8[x*4+2] = d[2];
3638                         buffer_FragColorbgra8[x*4+3] = d[3];
3639                 }
3640         }
3641         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3642         {
3643                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3644                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3645                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3646                 Color_Diffuse[3] = 0.0f;
3647                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3648                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3649                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3650                 LightColor[3] = 0.0f;
3651                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3652                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3653                 for (x = startx;x < endx;x++)
3654                 {
3655                         z = buffer_z[x];
3656                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3657                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3658                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3659                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3660                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3661                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3662                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3663                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3664
3665                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3666                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3667                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3668                         DPSOFTRAST_Vector3Normalize(lightnormal);
3669
3670                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3671                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3672                         {
3673                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3674                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3675                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3676                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3677                         }
3678                         else
3679                         {
3680                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3681                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3682                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3683                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3684                         }
3685                         buffer_FragColorbgra8[x*4+0] = d[0];
3686                         buffer_FragColorbgra8[x*4+1] = d[1];
3687                         buffer_FragColorbgra8[x*4+2] = d[2];
3688                         buffer_FragColorbgra8[x*4+3] = d[3];
3689                 }
3690         }
3691         else
3692         {
3693                 for (x = startx;x < endx;x++)
3694                 {
3695                         z = buffer_z[x];
3696                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3697                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3698                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3699                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3700
3701                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3702                         {
3703                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3704                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3705                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3706                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3707                         }
3708                         else
3709                         {
3710                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3711                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3712                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3713                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3714                         }
3715                         buffer_FragColorbgra8[x*4+0] = d[0];
3716                         buffer_FragColorbgra8[x*4+1] = d[1];
3717                         buffer_FragColorbgra8[x*4+2] = d[2];
3718                         buffer_FragColorbgra8[x*4+3] = d[3];
3719                 }
3720         }
3721         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3722 }
3723
3724
3725
3726 void DPSOFTRAST_VertexShader_LightSource(void)
3727 {
3728         int i;
3729         int numvertices = dpsoftrast.numvertices;
3730         float LightPosition[4];
3731         float LightVector[4];
3732         float LightVectorModelSpace[4];
3733         float EyePosition[4];
3734         float EyeVectorModelSpace[4];
3735         float EyeVector[4];
3736         float position[4];
3737         float svector[4];
3738         float tvector[4];
3739         float normal[4];
3740         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3741         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3742         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3743         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3744         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3745         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3746         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3747         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3748         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3749         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3750         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3751         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3752         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3753         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3754         for (i = 0;i < numvertices;i++)
3755         {
3756                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3757                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3758                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3759                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3760                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3761                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3762                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3763                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3764                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3765                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3766                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3767                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3768                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3769                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3770                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3771                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3772                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3773                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3774                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3775                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3776                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3777                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3778                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3779                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3780                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3781                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3782                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3783                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3784                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3785                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3786                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3787                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3788         }
3789         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3790         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3791 }
3792
3793 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3794 {
3795 #ifdef SSE2_PRESENT
3796         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3797         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3798         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3799         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3800         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3801         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3802         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3803         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3804         int x, startx = span->startx, endx = span->endx;
3805         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3806         float CubeVectordata[4];
3807         float CubeVectorslope[4];
3808         float LightVectordata[4];
3809         float LightVectorslope[4];
3810         float EyeVectordata[4];
3811         float EyeVectorslope[4];
3812         float z;
3813         float diffusetex[4];
3814         float glosstex[4];
3815         float surfacenormal[4];
3816         float lightnormal[4];
3817         float eyenormal[4];
3818         float specularnormal[4];
3819         float diffuse;
3820         float specular;
3821         float SpecularPower;
3822         float CubeVector[4];
3823         float attenuation;
3824         int d[4];
3825         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3826         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3827         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3828         Color_Glow[3] = 0.0f;
3829         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3830         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3831         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3832         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3833         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3834         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3835         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3836         Color_Diffuse[3] = 0.0f;
3837         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3838         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3839         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3840         Color_Specular[3] = 0.0f;
3841         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3842         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3843         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3844         Color_Pants[3] = 0.0f;
3845         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3846         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3847         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3848         Color_Shirt[3] = 0.0f;
3849         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3850         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3851         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3852         LightColor[3] = 0.0f;
3853         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3854         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3855         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3856         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3857         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3858         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3859         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3860         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3861         {
3862                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3863                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3864         }
3865         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3866                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3867         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3868         {
3869                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3870                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3871                 for (x = startx;x < endx;x++)
3872                 {
3873                         z = buffer_z[x];
3874                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3875                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3876                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3877                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3878                         if (attenuation < 0.01f)
3879                                 continue;
3880                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3881                         {
3882                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3883                                 if (attenuation < 0.01f)
3884                                         continue;
3885                         }
3886
3887                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3888                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3889                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3890                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3891                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3892                         {
3893                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3894                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3895                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3896                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3897                         }
3898                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3899                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3900                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3901                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3902                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3903                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3904                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3905                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3906
3907                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3908                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3909                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3910                         DPSOFTRAST_Vector3Normalize(lightnormal);
3911
3912                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3913                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3914                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3915                         DPSOFTRAST_Vector3Normalize(eyenormal);
3916
3917                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3918                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3919                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3920                         DPSOFTRAST_Vector3Normalize(specularnormal);
3921
3922                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3923                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3924                         specular = pow(specular, SpecularPower * glosstex[3]);
3925                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3926                         {
3927                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3928                                 attenuation *= (1.0f / 255.0f);
3929                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3930                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3931                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3932                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3933                         }
3934                         else
3935                         {
3936                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3937                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3938                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3939                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3940                         }
3941                         buffer_FragColorbgra8[x*4+0] = d[0];
3942                         buffer_FragColorbgra8[x*4+1] = d[1];
3943                         buffer_FragColorbgra8[x*4+2] = d[2];
3944                         buffer_FragColorbgra8[x*4+3] = d[3];
3945                 }
3946         }
3947         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3948         {
3949                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3950                 for (x = startx;x < endx;x++)
3951                 {
3952                         z = buffer_z[x];
3953                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3954                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3955                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3956                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3957                         if (attenuation < 0.01f)
3958                                 continue;
3959                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3960                         {
3961                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3962                                 if (attenuation < 0.01f)
3963                                         continue;
3964                         }
3965
3966                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3967                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3968                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3969                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3970                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3971                         {
3972                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3973                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3974                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3975                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3976                         }
3977                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3978                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3979                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3980                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3981
3982                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3983                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3984                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3985                         DPSOFTRAST_Vector3Normalize(lightnormal);
3986
3987                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3988                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3989                         {
3990                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3991                                 attenuation *= (1.0f / 255.0f);
3992                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3993                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3994                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3995                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3996                         }
3997                         else
3998                         {
3999                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4000                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4001                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4002                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4003                         }
4004                         buffer_FragColorbgra8[x*4+0] = d[0];
4005                         buffer_FragColorbgra8[x*4+1] = d[1];
4006                         buffer_FragColorbgra8[x*4+2] = d[2];
4007                         buffer_FragColorbgra8[x*4+3] = d[3];
4008                 }
4009         }
4010         else
4011         {
4012                 for (x = startx;x < endx;x++)
4013                 {
4014                         z = buffer_z[x];
4015                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4016                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4017                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4018                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4019                         if (attenuation < 0.01f)
4020                                 continue;
4021                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4022                         {
4023                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4024                                 if (attenuation < 0.01f)
4025                                         continue;
4026                         }
4027
4028                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4029                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4030                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4031                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4032                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4033                         {
4034                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4035                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4036                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4037                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4038                         }
4039                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4040                         {
4041                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4042                                 attenuation *= (1.0f / 255.0f);
4043                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4044                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4045                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4046                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4047                         }
4048                         else
4049                         {
4050                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4051                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4052                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4053                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4054                         }
4055                         buffer_FragColorbgra8[x*4+0] = d[0];
4056                         buffer_FragColorbgra8[x*4+1] = d[1];
4057                         buffer_FragColorbgra8[x*4+2] = d[2];
4058                         buffer_FragColorbgra8[x*4+3] = d[3];
4059                 }
4060         }
4061         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4062 #endif
4063 }
4064
4065
4066
4067 void DPSOFTRAST_VertexShader_Refraction(void)
4068 {
4069         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4070 }
4071
4072 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4073 {
4074         // TODO: IMPLEMENT
4075         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4076         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4077         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4078         memset(buffer_FragColorbgra8, 0, span->length*4);
4079         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4080 }
4081
4082
4083
4084 void DPSOFTRAST_VertexShader_Water(void)
4085 {
4086         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4087 }
4088
4089
4090 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4091 {
4092         // TODO: IMPLEMENT
4093         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4094         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4095         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4096         memset(buffer_FragColorbgra8, 0, span->length*4);
4097         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4098 }
4099
4100
4101
4102 void DPSOFTRAST_VertexShader_ShowDepth(void)
4103 {
4104         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4105 }
4106
4107 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4108 {
4109         // TODO: IMPLEMENT
4110         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4111         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4112         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4113         memset(buffer_FragColorbgra8, 0, span->length*4);
4114         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4115 }
4116
4117
4118
4119 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4120 {
4121         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4122 }
4123
4124 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4125 {
4126         // TODO: IMPLEMENT
4127         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4128         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4129         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4130         memset(buffer_FragColorbgra8, 0, span->length*4);
4131         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4132 }
4133
4134
4135
4136 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4137 {
4138         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4139 }
4140
4141 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4142 {
4143         // TODO: IMPLEMENT
4144         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4145         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4147         memset(buffer_FragColorbgra8, 0, span->length*4);
4148         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4149 }
4150
4151
4152
4153 typedef struct DPSOFTRAST_ShaderModeInfo_s
4154 {
4155         int lodarrayindex;
4156         void (*Vertex)(void);
4157         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4158         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4159         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4160 }
4161 DPSOFTRAST_ShaderModeInfo;
4162
4163 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4164 {
4165         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4166         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4167         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4168         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4169         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4170         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4171         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4172         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4173         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4174         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4175         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4176         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4177         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4178         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4179         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4180         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4181 };
4182
4183 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4184 {
4185         int i;
4186         int x;
4187         int startx;
4188         int endx;
4189 //      unsigned int c;
4190 //      unsigned int *colorpixel;
4191         unsigned int *depthpixel;
4192         float w;
4193         float wslope;
4194         int depth;
4195         int depthslope;
4196         unsigned int d;
4197         DPSOFTRAST_State_Triangle *triangle;
4198         DPSOFTRAST_State_Span *span;
4199         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4200         for (i = 0; i < thread->numspans; i++)
4201         {
4202                 span = &thread->spans[i];
4203                 triangle = &thread->triangles[span->triangle];
4204                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4205                 {
4206                         wslope = triangle->w[0];
4207                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4208                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4209                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4210                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4211                         switch(thread->fb_depthfunc)
4212                         {
4213                         default:
4214                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4215                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4216                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4217                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4218                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4219                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4220                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4221                         }
4222                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4223                         //for (x = 0;x < span->length;x++)
4224                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4225                         // if there is no color buffer, skip pixel shader
4226                         startx = 0;
4227                         endx = span->length;
4228                         while (startx < endx && !pixelmask[startx])
4229                                 startx++;
4230                         while (endx > startx && !pixelmask[endx-1])
4231                                 endx--;
4232                         if (startx >= endx)
4233                                 continue; // no pixels to fill
4234                         span->pixelmask = pixelmask;
4235                         span->startx = startx;
4236                         span->endx = endx;
4237                         // run pixel shader if appropriate
4238                         // do this before running depthmask code, to allow the pixelshader
4239                         // to clear pixelmask values for alpha testing
4240                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4241                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4242                         if (thread->depthmask)
4243                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4244                                         if (pixelmask[x])
4245                                                 depthpixel[x] = d;
4246                 }
4247                 else
4248                 {
4249                         // no depth testing means we're just dealing with color...
4250                         // if there is no color buffer, skip pixel shader
4251                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4252                         {
4253                                 memset(pixelmask, 1, span->length);
4254                                 span->pixelmask = pixelmask;
4255                                 span->startx = 0;
4256                                 span->endx = span->length;
4257                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4258                         }
4259                 }
4260         }
4261         thread->numspans = 0;
4262 }
4263
// Draw command record (opcode 22).  Fields as used in this file:
//   starty/endy   - framebuffer row range the draw may touch; threads whose band misses it skip the command
//   refcount      - atomic counter, initialised to the thread count and decremented once per thread
//   clipped       - nonzero when triangles must be tested against the near plane
//   firstvertex   - subtracted from element indices to index the copied arrays
//   numvertices / numtriangles - sizes of the copied vertex arrays and the element list
//   arrays        - start of the per-vertex float4 data (screen coords first, then the vertex arrays)
//   element3i / element3s - copied 32-bit or 16-bit index list (at most one is non-NULL)
// NOTE(review): DEFCOMMAND itself is defined outside this view; presumably it also supplies the
// common command header (the code reads command->commandsize) - confirm against the macro.
4264 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4265
// Interpret one queued Draw command on the given rasterizer thread.
// Each thread handles the framebuffer rows [miny, maxy) derived from its index; a command whose
// [starty, endy) range misses that band only has its reference dropped.  Otherwise every triangle is
// optionally clipped against the near plane, backface-culled, bounded to this thread's rows, given
// interpolation planes (w and each attribute array used by the shader mode) plus per-texture-unit
// mip levels, and scan-converted into spans that DPSOFTRAST_Draw_ProcessSpans consumes.
// The whole body is compiled only when SSE2_PRESENT is defined; without it the function does nothing.
//   thread  - per-thread state (triangle and span queues, cull face, viewport, bound textures)
//   command - the draw command; its refcount is decremented exactly once by this call, and the
//             thread that brings it to zero frees command->arrays when the command is no larger
//             than the bare header (i.e. the arrays were allocated separately)
4266 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4267 {
4268 #ifdef SSE2_PRESENT
4269         int cullface = thread->cullface;
4270         int width = dpsoftrast.fb_width;
             // framebuffer rows owned by this thread: [miny, maxy)
4271         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4272         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4273         __m128i fbmin, fbmax;
4274         __m128 viewportcenter, viewportscale;
4275         int firstvertex = command->firstvertex;
4276         int numvertices = command->numvertices;
4277         int numtriangles = command->numtriangles;
4278         const int *element3i = command->element3i;
4279         const unsigned short *element3s = command->element3s;
4280         int clipped = command->clipped;
4281         int i;
4282         int j;
4283         int k;
4284         int y;
4285         int e[3];
4286         __m128i screeny;
4287         int starty, endy;
4288         int numpoints;
4289         int clipcase;
4290         float clipdist[4];
4291         __m128 triangleedge1, triangleedge2, trianglenormal;
4292         __m128 clipfrac[3];
4293         __m128 screen[4];
4294         DPSOFTRAST_State_Triangle *triangle;
4295         DPSOFTRAST_Texture *texture;
             // the command's row range does not intersect this thread's band:
             // just release our reference and return
4296         if (command->starty >= maxy || command->endy <= miny)
4297         {
4298                 if (!ATOMIC_DECREMENT(command->refcount))
4299                 {
4300                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4301                                 MM_FREE(command->arrays);
4302                 }
4303                 return;
4304         }
4305         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp bounds as packed 16-bit (x, y) pairs: x in [0, width-1], y in [miny, maxy-1]
4306         fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4307         fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4308         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4309         viewportscale = _mm_load_ps(thread->fb_viewportscale);
             // give defined values to the slots that not every clip case below writes
4310         screen[3] = _mm_setzero_ps();
4311         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4312         for (i = 0;i < numtriangles;i++)
4313         {
                     // command->arrays begins with numvertices screen-space float4s; 'arrays' starts at
                     // the float4 array that follows them (its z and w feed the near-plane test below)
4314                 const float *screencoord4f = command->arrays;
4315                 const float *arrays = screencoord4f + numvertices*4;
4316
4317                 // generate the 3 edges of this triangle
4318                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // fetch the three vertex indices relative to firstvertex;
                     // a draw without an element list uses consecutive vertices
4319                 if (element3s)
4320                 {
4321                         e[0] = element3s[i*3+0] - firstvertex;
4322                         e[1] = element3s[i*3+1] - firstvertex;
4323                         e[2] = element3s[i*3+2] - firstvertex;
4324                 }
4325                 else if (element3i)
4326                 {
4327                         e[0] = element3i[i*3+0] - firstvertex;
4328                         e[1] = element3i[i*3+1] - firstvertex;
4329                         e[2] = element3i[i*3+2] - firstvertex;
4330                 }
4331                 else
4332                 {
4333                         e[0] = i*3+0;
4334                         e[1] = i*3+1;
4335                         e[2] = i*3+2;
4336                 }
4337
                     // SKIPBACKFACE: computes the two screen-space edge vectors from screen[1] and, in the
                     // scalar lane of trianglenormal, edge1.x*edge2.y - edge1.y*edge2.x; it then 'continue's
                     // to the next triangle when that value is negative (GL_BACK) or positive (GL_FRONT).
                     // The edge vectors and trianglenormal are reused by the plane setup further down.
4338 #define SKIPBACKFACE \
4339                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4340                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4341                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4342                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4343                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4344                 switch(cullface) \
4345                 { \
4346                 case GL_BACK: \
4347                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4348                                 continue; \
4349                         break; \
4350                 case GL_FRONT: \
4351                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4352                                 continue; \
4353                         break; \
4354                 }
4355
                     // CLIPPEDVERTEXLERP(k,p1,p2): interpolates vertices p1 and p2 of the array at 'arrays'
                     // by clipdist[p1]/(clipdist[p1]-clipdist[p2]) (the point where clipdist reaches zero),
                     // projects the result into screen[k], and keeps the fraction in clipfrac[p1] so that
                     // GENATTRIBLERP can reuse it.
4356 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4357                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4358                         { \
4359                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4360                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4361                         }
                     // CLIPPEDVERTEXCOPY(k,p1): vertex p1 is kept, so load its stored screen coordinate
4362 #define CLIPPEDVERTEXCOPY(k,p1) \
4363                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4364
                     // GENATTRIBCOPY / GENATTRIBLERP: same copy-or-interpolate choice as above, applied to
                     // whichever attribute array 'arrays' currently points at
4365 #define GENATTRIBCOPY(attrib, p1) \
4366                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4367 #define GENATTRIBLERP(attrib, p1, p2) \
4368                 { \
4369                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4370                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4371                 }
                     // GENATTRIBS: produces the attribute values for screen[0..2] according to the
                     // clipcase chosen below (only three points are needed to define the planes,
                     // even when the clipped polygon has four)
4372 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4373                 switch(clipcase) \
4374                 { \
4375                 default: \
4376                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4377                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4378                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4379                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4380                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4381                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4382                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4383                 }
4384
                     // draws not flagged as clipped skip the near-plane classification entirely
                     // and jump straight into the "all three vertices kept" case
4385                 if (! clipped)
4386                         goto notclipped;
4387
4388                 // calculate distance from nearplane
4389                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4390                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4391                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have clipdist >= 0 (kept).  Each case fills screen[],
                     // sets numpoints (3 or 4 points in the clipped polygon) and records clipcase so that
                     // GENATTRIBS can reproduce the same copy/lerp pattern for the attributes.
4392                 if (clipdist[0] >= 0.0f)
4393                 {
4394                         if (clipdist[1] >= 0.0f)
4395                         {
4396                                 if (clipdist[2] >= 0.0f)
4397                                 {
4398                                 notclipped:
4399                                         // triangle is entirely in front of nearplane
4400                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4401                                         SKIPBACKFACE;
4402                                         numpoints = 3;
4403                                         clipcase = 0;
4404                                 }
4405                                 else
4406                                 {
4407                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4408                                         SKIPBACKFACE;
4409                                         numpoints = 4;
4410                                         clipcase = 1;
4411                                 }
4412                         }
4413                         else
4414                         {
4415                                 if (clipdist[2] >= 0.0f)
4416                                 {
4417                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4418                                         SKIPBACKFACE;
4419                                         numpoints = 4;
4420                                         clipcase = 2;
4421                                 }
4422                                 else
4423                                 {
4424                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4425                                         SKIPBACKFACE;
4426                                         numpoints = 3;
4427                                         clipcase = 3;
4428                                 }
4429                         }
4430                 }
4431                 else if (clipdist[1] >= 0.0f)
4432                 {
4433                         if (clipdist[2] >= 0.0f)
4434                         {
4435                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4436                                 SKIPBACKFACE;
4437                                 numpoints = 4;
4438                                 clipcase = 4;
4439                         }
4440                         else
4441                         {
4442                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4443                                 SKIPBACKFACE;
4444                                 numpoints = 3;
4445                                 clipcase = 5;
4446                         }
4447                 }
4448                 else if (clipdist[2] >= 0.0f)
4449                 {
4450                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4451                         SKIPBACKFACE;
4452                         numpoints = 3;
4453                         clipcase = 6;
4454                 }
4455                 else continue; // triangle is entirely behind nearplane
4456
4457                 {
4458                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of the points into eight 16-bit lanes
                             // (point 2 is duplicated when there are only three points); the min/max
                             // reductions leave the bounding box in lanes 0 (x) and 1 (y), which is then
                             // clamped to this thread's part of the framebuffer
4459                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4460                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4461                                         screenmin = _mm_min_epi16(screeni, screenir),
4462                                         screenmax = _mm_max_epi16(screeni, screenir);
4463                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4464                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4465                         screenmin = _mm_max_epi16(screenmin, fbmin);
4466                         screenmax = _mm_min_epi16(screenmax, fbmax);
4467                         // skip offscreen triangles
                             // (the low 32 bits hold the x and y comparisons: nonzero when the clamped box is empty)
4468                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4469                                 continue;
4470                         starty = _mm_extract_epi16(screenmin, 1);
4471                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // arithmetic shift keeps the y half of each packed (x, y) pair: one integer y per point
4472                         screeny = _mm_srai_epi32(screeni, 16);
4473                 }
4474
                     // next free slot in this thread's triangle queue; spans refer to it by index
4475                 triangle = &thread->triangles[thread->numtriangles];
4476
4477                 // calculate attribute plans for triangle data...
4478                 // okay, this triangle is going to produce spans, we'd better project
4479                 // the interpolants now (this is what gives perspective texturing),
4480                 // this consists of simply multiplying all arrays by the W coord
4481                 // (which is basically 1/Z), which will be undone per-pixel
4482                 // (multiplying by Z again) to get the perspective-correct array
4483                 // values
4484                 {
4485                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4486                         __m128 mipedgescale, mipdensity;
                             // the x/y components of both edge vectors divided by the signed-area term from
                             // SKIPBACKFACE; each is then broadcast so that differences along the two edges
                             // can be turned into per-pixel x and y slopes
4487                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4488                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4489                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4490                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4491                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4492                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4493                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4494                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself, stored as triangle->w = { x slope, y slope, value at (0,0) }
4495                         attribedge1 = _mm_sub_ss(w0, w1);
4496                         attribedge2 = _mm_sub_ss(w2, w1);
4497                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4498                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4499                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4500                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4501                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4502                         _mm_store_ss(&triangle->w[0], attribxslope);
4503                         _mm_store_ss(&triangle->w[1], attribyslope);
4504                         _mm_store_ss(&triangle->w[2], attriborigin);
4505                         mipedgescale = _mm_setzero_ps();
                             // one plane per attribute array used by the current shader mode; the mode's
                             // array list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL
4506                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4507                         {
4508                                 __m128 attrib0, attrib1, attrib2;
4509                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4510                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4511                                         break;
                                     // step to the next float4 array in the command data
4512                                 arrays += numvertices*4;
4513                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                     // same plane construction as for w, on the attributes premultiplied by w
4514                                 attriborigin = _mm_mul_ps(attrib1, w1);
4515                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4516                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4517                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4518                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4519                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4520                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4521                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4522                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array the shader mode uses for LOD, remember the change of its
                                     // first two components along each edge, scaled by the approximate
                                     // reciprocal screen length of that edge
4523                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4524                                 {
4525                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4526                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4527                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4528                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4529                                 }
4530                         }
4531
                             // choose a mip level per texture unit used by the shader mode (default 0);
                             // only textures whose filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get one
4532                         memset(triangle->mip, 0, sizeof(triangle->mip));
4533                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4534                         {
4535                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4536                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4537                                         break;
4538                                 texture = thread->texbound[texunit];
4539                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4540                                 {
                                             // scales the edge deltas by two ints read from texture->mipmap[0][2..3]
                                             // (presumably the base level's width and height - confirm against the
                                             // texture struct), squares and sums per edge, keeps the smaller edge
4541                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4542                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4543                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4544                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4545                                         // this will be multiplied in the texturing routine by the texture resolution
4546                                         y = _mm_cvtss_si32(mipdensity);
4547                                         if (y > 0)
4548                                         {
                                                     // 0.5*log2(y), i.e. log2 of the square root of the squared density,
                                                     // clamped to the last available mipmap
4549                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4550                                                 if (y > texture->mipmaps - 1)
4551                                                         y = texture->mipmaps - 1;
4552                                                 triangle->mip[texunit] = y;
4553                                         }
4554                                 }
4555                         }
4556                 }
4557
                     // scan conversion: y is advanced inside the loop body, one run of rows per edge pair
4558                 for (y = starty; y < endy;)
4559                 {
4560                         __m128 xcoords, xslope;
                             // yccmask has one 4-bit group per point (point 0 in the low nibble), set when
                             // y is greater than that point's integer y; the pattern selects the two
                             // polygon edges (edgeNp -> edgeNn point indices) that cross the current row
4561                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4562                         int yccmask = _mm_movemask_epi8(ycc);
4563                         int edge0p, edge0n, edge1p, edge1n;
4564                         int nexty;
4565                         if (numpoints == 4)
4566                         {
4567                                 switch(yccmask)
4568                                 {
4569                                 default:
4570                                 case 0xFFFF: /*0000*/ y = endy; continue;
4571                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4572                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4573                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4574                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4575                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4576                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4577                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4578                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4579                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4580                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4581                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4582                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4583                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4584                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4585                                 case 0x0000: /*1111*/ y++; continue;
4586                                 }
4587                         }
4588                         else
4589                         {
                                     // three points: the fourth lane duplicates point 2, so the two high
                                     // nibbles of the mask always agree
4590                                 switch(yccmask)
4591                                 {
4592                                 default:
4593                                 case 0xFFFF: /*000*/ y = endy; continue;
4594                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4595                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4596                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4597                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4598                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4599                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4600                                 case 0x0000: /*111*/ y++; continue;
4601                                 }
4602                         }
                             // nexty: last row that can use this edge pair.  Lanes of points already passed
                             // become 0x7FFF, the others keep the point's y, and the minimum over all lanes
                             // is taken; the result is capped at endy-1
4603                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4604                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4605                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4606                         nexty = _mm_extract_epi16(ycc, 0);
4607                         if(nexty >= endy) nexty = endy-1;
                             // make edge0 the left edge: swap when edge0's larger x exceeds edge1's smaller x
4608                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4609                         {
4610                                 int tmp = edge0n;
4611                                 edge0n = edge1n;
4612                                 edge1n = tmp;
4613                                 tmp = edge0p;
4614                                 edge0p = edge1p;
4615                                 edge1p = tmp;
4616                         }
                             // xslope lanes 0 and 2 = dx/dy of edge0 and edge1; xcoords lanes 0 and 2 = x of
                             // each edge at row y, with 0.5 added before the conversion to int below
4617                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4618                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4619                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4620                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4621                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4622                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4623                         {
4624                                 int startx, endx, offset;
4625                                 startx = _mm_cvtss_si32(xcoords);
4626                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // clip the row's x range to the framebuffer width
4627                                 if (startx < 0) startx = 0;
4628                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4629                                 if (startx >= endx) continue;
                                     // emit the row as spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels,
                                     // flushing the span queue whenever it fills up
4630                                 for (offset = startx; offset < endx;)
4631                                 {
4632                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4633                                         span->triangle = thread->numtriangles;
4634                                         span->x = offset;
4635                                         span->y = y;
4636                                         span->length = endx - offset;
4637                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4638                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4639                                         offset += span->length;
4640                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4641                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4642                                 }
4643                         }
4644                 }
4645
                     // triangle queue full: process the pending spans so the slots can be reused
4646                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4647                 {
4648                         DPSOFTRAST_Draw_ProcessSpans(thread);
4649                         thread->numtriangles = 0;
4650                 }
4651         }
4652
             // this thread is done reading the command data: drop our reference; the thread that
             // reaches zero frees separately allocated arrays
4653         if (!ATOMIC_DECREMENT(command->refcount))
4654         {
4655                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4656                         MM_FREE(command->arrays);
4657         }
4658
             // flush whatever is still queued for this draw
4659         if (thread->numspans > 0 || thread->numtriangles > 0)
4660         {
4661                 DPSOFTRAST_Draw_ProcessSpans(thread);
4662                 thread->numtriangles = 0;
4663         }
4664 #endif
4665 }
4666
// Reserve a Draw command together with storage for its per-vertex data and a copy of the
// element list.  The data area is laid out, in order, as:
//   1. numvertices float4 screen coordinates (dpsoftrast.screencoord4f)
//   2. numvertices float4 for dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION]
//   3. numvertices float4 for each array listed by the current shader mode
//   4. the element list: 16-bit if element3s is given, else 32-bit if element3i is given, else nothing
// If header plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, only the header is placed in the
// command buffer and the data is MM_CALLOC'ed separately; otherwise the data directly follows the
// header inside the command allocation.
// Side effects: sets dpsoftrast.firstvertex, numvertices and screencoord4f, and clears and then
// repoints dpsoftrast.post_array4f[] into the new data area.
// Returns the command with firstvertex, numvertices, numtriangles, arrays, element3i and element3s
// filled in; the remaining fields are not written here.
// NOTE(review): the MM_CALLOC result is used without a NULL check.
4667 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4668 {
4669         int i;
4670         int j;
4671         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
             // screen coordinates + position array
4672         int datasize = 2*numvertices*sizeof(float[4]);
4673         DPSOFTRAST_Command_Draw *command;
4674         unsigned char *data;
             // one more float4 array per array used by the shader mode
             // (the mode's list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL)
4675         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4676         {
4677                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4678                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4679                         break;
4680                 datasize += numvertices*sizeof(float[4]);
4681         }
             // room for the copied element list, if any
4682         if (element3s)
4683                 datasize += numtriangles*sizeof(unsigned short[3]);
4684         else if (element3i)
4685                 datasize += numtriangles*sizeof(int[3]);
4686         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4687         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4688         {
                     // too large for one command: header only in the command buffer, data allocated separately
                     // (DPSOFTRAST_Interpret_Draw frees command->arrays when the command is header-sized)
4689                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4690                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4691         }
4692         else
4693         {
                     // data lives directly behind the header in the same command allocation
4694                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4695                 data = (unsigned char *)command + commandsize;
4696         }
4697         command->firstvertex = firstvertex;
4698         command->numvertices = numvertices;
4699         command->numtriangles = numtriangles;
4700         command->arrays = (float *)data;
             // carve the data area; arrays not used by the shader mode stay NULL in post_array4f
4701         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4702         dpsoftrast.firstvertex = firstvertex;
4703         dpsoftrast.numvertices = numvertices;
4704         dpsoftrast.screencoord4f = (float *)data;
4705         data += numvertices*sizeof(float[4]);
4706         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4707         data += numvertices*sizeof(float[4]);
             // same order as the size calculation above and as DPSOFTRAST_Interpret_Draw walks the arrays
4708         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4709         {
4710                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4711                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4712                         break;
4713                 dpsoftrast.post_array4f[j] = (float *)data;
4714                 data += numvertices*sizeof(float[4]);
4715         }
             // copy the caller's element list to the end of the data area (16-bit list takes precedence)
4716         command->element3i = NULL;
4717         command->element3s = NULL;
4718         if (element3s)
4719         {
4720                 command->element3s = (unsigned short *)data;
4721                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4722         }
4723         else if (element3i)
4724         {
4725                 command->element3i = (int *)data;
4726                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4727         }
4728         return command;
4729 }
4730
4731 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4732 {
4733         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4734         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4735         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4736         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4737         if (command->starty >= command->endy)
4738         {
4739                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4740                         MM_FREE(command->arrays);
4741                 DPSOFTRAST_UndoCommand(command->commandsize);
4742                 return;
4743         }
4744         command->clipped = dpsoftrast.drawclipped;
4745         command->refcount = dpsoftrast.numthreads;
4746
4747 #ifdef USE_THREADS
4748         DPSOFTRAST_Draw_SyncCommands();
4749         {
4750                 int i;
4751                 int nexty = 0;
4752                 for (i = 0; i < dpsoftrast.numthreads; i++)
4753                 {
4754                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4755                         int y = nexty;
4756                         nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4757                         if (command->starty < nexty && command->endy > y && thread->starving)
4758                                 SDL_CondSignal(thread->drawcond);
4759                 }
4760         }
4761 #else
4762         DPSOFTRAST_Draw_FlushThreads();
4763 #endif
4764 }
4765  
4766 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4767 {
4768         int commandoffset = thread->commandoffset;
4769         while (commandoffset != endoffset)
4770         {
4771                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4772                 switch (command->opcode)
4773                 {
4774 #define INTERPCOMMAND(name) \
4775                 case DPSOFTRAST_OPCODE_##name : \
4776                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4777                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4778                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4779                                 commandoffset = 0; \
4780                         break;
4781                 INTERPCOMMAND(Viewport)
4782                 INTERPCOMMAND(ClearColor)
4783                 INTERPCOMMAND(ClearDepth)
4784                 INTERPCOMMAND(ColorMask)
4785                 INTERPCOMMAND(DepthTest)
4786                 INTERPCOMMAND(ScissorTest)
4787                 INTERPCOMMAND(Scissor)
4788                 INTERPCOMMAND(BlendFunc)
4789                 INTERPCOMMAND(BlendSubtract)
4790                 INTERPCOMMAND(DepthMask)
4791                 INTERPCOMMAND(DepthFunc)
4792                 INTERPCOMMAND(DepthRange)
4793                 INTERPCOMMAND(PolygonOffset)
4794                 INTERPCOMMAND(CullFace)
4795                 INTERPCOMMAND(AlphaTest)
4796                 INTERPCOMMAND(AlphaFunc)
4797                 INTERPCOMMAND(SetTexture)
4798                 INTERPCOMMAND(SetShader)
4799                 INTERPCOMMAND(Uniform4f)
4800                 INTERPCOMMAND(UniformMatrix4f)
4801                 INTERPCOMMAND(Uniform1i)
4802
4803                 case DPSOFTRAST_OPCODE_Draw:
4804                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4805                         commandoffset += command->commandsize;
4806                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4807                                 commandoffset = 0;
4808                         thread->commandoffset = commandoffset;
4809                         break;
4810
4811                 case DPSOFTRAST_OPCODE_Reset:
4812                         commandoffset = 0;
4813                         break;
4814                 }
4815         }
4816         thread->commandoffset = commandoffset;
4817 }
4818
#ifdef USE_THREADS
// Worker thread entry point (started by DPSOFTRAST_Init via SDL_CreateThread).
// data is the thread's own DPSOFTRAST_State_Thread. The worker keeps
// interpreting queued commands until its index becomes negative, which is
// how DPSOFTRAST_Shutdown asks it to exit. Always returns 0.
static int DPSOFTRAST_Draw_Thread(void *data)
{
	DPSOFTRAST_State_Thread *self = (DPSOFTRAST_State_Thread *)data;

	for (;;)
	{
		if (self->index < 0)
			break;

		// work pending: run everything queued up to the current write position
		if (self->commandoffset != dpsoftrast.drawcommand)
		{
			DPSOFTRAST_Draw_InterpretCommands(self, dpsoftrast.drawcommand);
			continue;
		}

		// caught up: re-check under the mutex, then sleep until signalled
		SDL_LockMutex(self->drawmutex);
		if (self->index >= 0 && self->commandoffset == dpsoftrast.drawcommand)
		{
			// tell a flusher blocked on waitcond that this thread has caught up
			if (self->waiting)
				SDL_CondSignal(self->waitcond);
			self->starving = true;
			SDL_CondWait(self->drawcond, self->drawmutex);
			self->starving = false;
		}
		SDL_UnlockMutex(self->drawmutex);
	}
	return 0;
}
#endif
4845
4846 static void DPSOFTRAST_Draw_FlushThreads(void)
4847 {
4848         DPSOFTRAST_State_Thread *thread;
4849         int i;
4850         DPSOFTRAST_Draw_SyncCommands();
4851 #ifdef USE_THREADS
4852         for (i = 0; i < dpsoftrast.numthreads; i++)
4853         {
4854                 thread = &dpsoftrast.threads[i];
4855                 if (thread->commandoffset != dpsoftrast.drawcommand)
4856                 {
4857                         SDL_LockMutex(thread->drawmutex);
4858                         if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4859                                 SDL_CondSignal(thread->drawcond);
4860                         SDL_UnlockMutex(thread->drawmutex);
4861                 }
4862         }
4863 #endif                  
4864         for (i = 0; i < dpsoftrast.numthreads; i++)
4865         {
4866                 thread = &dpsoftrast.threads[i];
4867 #ifdef USE_THREADS
4868                 if (thread->commandoffset != dpsoftrast.drawcommand)
4869                 {
4870                         SDL_LockMutex(thread->drawmutex);
4871                         if (thread->commandoffset != dpsoftrast.drawcommand)
4872                         {
4873                                 thread->waiting = true;
4874                                 SDL_CondWait(thread->waitcond, thread->drawmutex);
4875                                 thread->waiting = false;
4876                         }
4877                         SDL_UnlockMutex(thread->drawmutex);
4878                 }
4879 #else
4880                 if (thread->commandoffset != dpsoftrast.drawcommand)
4881                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4882 #endif
4883         }
4884         dpsoftrast.commandpool.usedcommands = 0;
4885 }
4886
// Public flush entry point: returns once all queued commands have been
// processed (delegates to DPSOFTRAST_Draw_FlushThreads).
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4891
// Public finish entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
4896
4897 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4898 {
4899         int i;
4900         union
4901         {
4902                 int i;
4903                 unsigned char b[4];
4904         }
4905         u;
4906         u.i = 1;
4907         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4908         dpsoftrast.bigendian = u.b[3];
4909         dpsoftrast.fb_width = width;
4910         dpsoftrast.fb_height = height;
4911         dpsoftrast.fb_depthpixels = depthpixels;
4912         dpsoftrast.fb_colorpixels[0] = colorpixels;
4913         dpsoftrast.fb_colorpixels[1] = NULL;
4914         dpsoftrast.fb_colorpixels[1] = NULL;
4915         dpsoftrast.fb_colorpixels[1] = NULL;
4916         dpsoftrast.viewport[0] = 0;
4917         dpsoftrast.viewport[1] = 0;
4918         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4919         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4920         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4921         dpsoftrast.texture_firstfree = 1;
4922         dpsoftrast.texture_end = 1;
4923         dpsoftrast.texture_max = 0;
4924         dpsoftrast.color[0] = 1;
4925         dpsoftrast.color[1] = 1;
4926         dpsoftrast.color[2] = 1;
4927         dpsoftrast.color[3] = 1;
4928 #ifdef USE_THREADS
4929         dpsoftrast.numthreads = bound(1, numthreads, 64);
4930 #else
4931         dpsoftrast.numthreads = 1;
4932 #endif
4933         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4934         for (i = 0; i < dpsoftrast.numthreads; i++)
4935         {
4936                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4937                 thread->index = i;
4938                 thread->cullface = GL_BACK;
4939                 thread->colormask[1] = 1;
4940                 thread->colormask[2] = 1;
4941                 thread->colormask[3] = 1;
4942                 thread->blendfunc[0] = GL_ONE;
4943                 thread->blendfunc[1] = GL_ZERO;
4944                 thread->depthmask = true;
4945                 thread->depthtest = true;
4946                 thread->depthfunc = GL_LEQUAL;
4947                 thread->scissortest = false;
4948                 thread->alphatest = false;
4949                 thread->alphafunc = GL_GREATER;
4950                 thread->alphavalue = 0.5f;
4951                 thread->viewport[0] = 0;
4952                 thread->viewport[1] = 0;
4953                 thread->viewport[2] = dpsoftrast.fb_width;
4954                 thread->viewport[3] = dpsoftrast.fb_height;
4955                 thread->scissor[0] = 0;
4956                 thread->scissor[1] = 0;
4957                 thread->scissor[2] = dpsoftrast.fb_width;
4958                 thread->scissor[3] = dpsoftrast.fb_height;
4959                 thread->depthrange[0] = 0;
4960                 thread->depthrange[1] = 1;
4961                 thread->polygonoffset[0] = 0;
4962                 thread->polygonoffset[1] = 0;
4963
4964                 thread->numspans = 0;
4965                 thread->numtriangles = 0;
4966                 thread->commandoffset = 0;
4967                 thread->waiting = false;
4968                 thread->starving = false;
4969 #ifdef USE_THREADS
4970                 thread->waitcond = SDL_CreateCond();
4971                 thread->drawcond = SDL_CreateCond();
4972                 thread->drawmutex = SDL_CreateMutex();
4973 #endif
4974
4975                 thread->validate = -1;
4976                 DPSOFTRAST_Validate(thread, -1);
4977 #ifdef USE_THREADS
4978                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4979 #endif
4980         }
4981 }
4982
4983 void DPSOFTRAST_Shutdown(void)
4984 {
4985         int i;
4986 #ifdef USE_THREADS
4987         if(dpsoftrast.numthreads > 0)
4988         {
4989                 DPSOFTRAST_State_Thread *thread;
4990                 for (i = 0; i < dpsoftrast.numthreads; i++)
4991                 {
4992                         thread = &dpsoftrast.threads[i];
4993                         SDL_LockMutex(thread->drawmutex);
4994                         thread->index = -1;
4995                         SDL_CondSignal(thread->drawcond);
4996                         SDL_UnlockMutex(thread->drawmutex);
4997                         SDL_WaitThread(thread->thread, NULL);
4998                         SDL_DestroyCond(thread->waitcond);
4999                         SDL_DestroyCond(thread->drawcond);
5000                         SDL_DestroyMutex(thread->drawmutex);
5001                 }
5002         }
5003 #endif
5004         for (i = 0;i < dpsoftrast.texture_end;i++)
5005                 if (dpsoftrast.texture[i].bytes)
5006                         MM_FREE(dpsoftrast.texture[i].bytes);
5007         if (dpsoftrast.texture)
5008                 free(dpsoftrast.texture);
5009         if (dpsoftrast.threads)
5010                 MM_FREE(dpsoftrast.threads);
5011         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5012 }
5013