]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
always force a flush on copy to texture
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// Alignment (in bytes) used by ALIGN() and ATOMIC() respectively.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Compiler-specific alignment and atomic primitives. They are only selected
// when SSE2_PRESENT is defined and the compiler is GCC-compatible or MSVC;
// every other configuration uses the plain fallbacks further down.
#if defined(SSE2_PRESENT)
#	if defined(__GNUC__)
#		define ALIGN(var) var __attribute__((__aligned__(16)))
#		define ATOMIC(var) var __attribute__((__aligned__(32)))
#		define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
#		define ATOMIC_COUNTER volatile int
#		define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
#		define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
#		define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#	elif defined(_MSC_VER)
#		define ALIGN(var) __declspec(align(16)) var
#		define ATOMIC(var) __declspec(align(32)) var
#		define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
#		define ATOMIC_COUNTER volatile LONG
#		define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
#		define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
#		define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
#	endif
#endif

// Fallbacks: no alignment request, no barrier, and ordinary (non-atomic)
// integer arithmetic on the counters.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
59
// Heap helpers for pixel and state storage. With SSE2 the blocks come from
// _mm_malloc aligned to ATOMIC_SIZE bytes and must be released with MM_FREE;
// without SSE2 they map straight onto the C allocator.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() counterpart for _mm_malloc: returns nmemb*size zeroed bytes, or
// NULL if the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// Indices of the per-vertex attribute arrays: position, color and eight
// texcoord sets. DPSOFTRAST_ARRAY_TOTAL is the number of slots and is used
// to size arrays, so it must stay last.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR,
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
94
95 typedef struct DPSOFTRAST_Texture_s
96 {
97         int flags;
98         int width;
99         int height;
100         int depth;
101         int sides;
102         DPSOFTRAST_TEXTURE_FILTER filter;
103         int mipmaps;
104         int size;
105         ATOMIC_COUNTER binds;
106         unsigned char *bytes;
107         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
108 }
109 DPSOFTRAST_Texture;
110
// Commands use the same alignment as ALIGN().
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Generic command header; the structs generated by DEFCOMMAND start with
// these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // one of the DPSOFTRAST_OPCODE_ values
	unsigned short commandsize;
}
DPSOFTRAST_Command);
120
// Opcode 0 has no command struct of its own.
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and an aligned struct DPSOFTRAST_Command_<name>
// made of the common header followed by `fields`.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
131
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
134
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
136 {
137         int freecommand;
138         int usedcommands;
139         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
140 }
141 DPSOFTRAST_State_Command_Pool);
142
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
144 {
145         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
146         float w[3];
147         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
148 }
149 DPSOFTRAST_State_Triangle);
150
// Evaluate attribute array `arrayindex` of `triangle` at the start of `span`
// (SSE version): `slope` receives the per-x slope vector and `data` receives
// base + span->x * xslope + span->y * yslope.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: `data` and `slope` are float[4].
// `span` is parenthesized like in the SSE version so any pointer expression
// can be passed.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
167                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x; // framebuffer x coord
	int y; // framebuffer y coord
	int startx; // usable range (according to pixelmask)
	int endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
180
// Capacity of the per-thread span and triangle arrays.
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// Bits of DPSOFTRAST_State_Thread.validate: each one marks a group of
// derived values that DPSOFTRAST_Validate has to recompute.
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
188
// Blend modes the rasterizer implements; DPSOFTRAST_RecalcBlendFunc picks
// one from the blend factors. DPSOFTRAST_BLENDMODE_TOTAL is the number of
// modes and must stay last.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
204
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
206 {
207         void *thread;
208         int index;
209         
210         int cullface;
211         int colormask[4];
212         int blendfunc[2];
213         int blendsubtract;
214         int depthmask;
215         int depthtest;
216         int depthfunc;
217         int scissortest;
218         int alphatest;
219         int alphafunc;
220         float alphavalue;
221         int viewport[4];
222         int scissor[4];
223         float depthrange[2];
224         float polygonoffset[2];
225
226         int shader_mode;
227         int shader_permutation;
228
229         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
230         
231         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
233
234         // DPSOFTRAST_VALIDATE_ flags
235         int validate;
236
237         // derived values (DPSOFTRAST_VALIDATE_FB)
238         int fb_colormask;
239         int fb_scissor[4];
240         ALIGN(float fb_viewportcenter[4]);
241         ALIGN(float fb_viewportscale[4]);
242
243         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
244         int fb_depthfunc;
245
246         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
247         int fb_blendmode;
248
249         // band boundaries
250         int miny1;
251         int maxy1;
252         int miny2;
253         int maxy2;
254
255         ATOMIC(volatile int commandoffset);
256
257         volatile bool waiting;
258         volatile bool starving;
259         void *waitcond;
260         void *drawcond;
261         void *drawmutex;
262
263         int numspans;
264         int numtriangles;
265         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
267 }
268 DPSOFTRAST_State_Thread);
269
270 typedef ATOMIC(struct DPSOFTRAST_State_s
271 {
272         int fb_width;
273         int fb_height;
274         unsigned int *fb_depthpixels;
275         unsigned int *fb_colorpixels[4];
276
277         int viewport[4];
278         ALIGN(float fb_viewportcenter[4]);
279         ALIGN(float fb_viewportscale[4]);
280
281         float color[4];
282         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
284
285         const float *pointer_vertex3f;
286         const float *pointer_color4f;
287         const unsigned char *pointer_color4ub;
288         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
289         int stride_vertex;
290         int stride_color;
291         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
294
295         int firstvertex;
296         int numvertices;
297         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298         float *screencoord4f;
299         int drawstarty;
300         int drawendy;
301         int drawclipped;
302         
303         int shader_mode;
304         int shader_permutation;
305
306         int texture_max;
307         int texture_end;
308         int texture_firstfree;
309         DPSOFTRAST_Texture *texture;
310
311         int bigendian;
312
313         // error reporting
314         const char *errorstring;
315
316         bool usethreads;
317         int interlace;
318         int numthreads;
319         DPSOFTRAST_State_Thread *threads;
320
321         ATOMIC(volatile int drawcommand);
322
323         DPSOFTRAST_State_Command_Pool commandpool;
324 }
325 DPSOFTRAST_State);
326
327 DPSOFTRAST_State dpsoftrast;
328
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one integer: a in bits 24-31, r in bits
// 16-23, g in bits 8-15 and b in bits 0-7, each scaled by 255 and rounded.
// Arguments are parenthesized so compound expressions convert correctly.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to integer depth: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 {
337         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339         fb_viewportcenter[3] = 0.5f;
340         fb_viewportcenter[0] = 0.0f;
341         fb_viewportscale[1] = 0.5f * viewport[2];
342         fb_viewportscale[2] = -0.5f * viewport[3];
343         fb_viewportscale[3] = 0.5f;
344         fb_viewportscale[0] = 1.0f;
345 }
346
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 {
349         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350         // and viewport projection values
351         int x1, x2;
352         int y1, y2;
353         x1 = thread->scissor[0];
354         x2 = thread->scissor[0] + thread->scissor[2];
355         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356         y2 = dpsoftrast.fb_height - thread->scissor[1];
357         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
358         if (x1 < 0) x1 = 0;
359         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360         if (y1 < 0) y1 = 0;
361         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362         thread->fb_scissor[0] = x1;
363         thread->fb_scissor[1] = y1;
364         thread->fb_scissor[2] = x2 - x1;
365         thread->fb_scissor[3] = y2 - y1;
366
367         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
368 }
369
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
373 }
374
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 {
377         if (thread->blendsubtract)
378         {
379                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380                 {
381                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
385                 }
386         }
387         else
388         {       
389                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390                 {
391                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
396                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
402                 }
403         }
404 }
405
// Inline pre-check for DPSOFTRAST_Validate: the call is only made when at
// least one of the requested flags `f` is still pending on `thread`.
// Always evaluates to 0. `thread` is evaluated more than once, so pass a
// side-effect-free expression.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
407
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
409 {
410         mask &= thread->validate;
411         if (!mask)
412                 return;
413         if (mask & DPSOFTRAST_VALIDATE_FB)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416                 DPSOFTRAST_RecalcFB(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421                 DPSOFTRAST_RecalcDepthFunc(thread);
422         }
423         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424         {
425                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426                 DPSOFTRAST_RecalcBlendFunc(thread);
427         }
428 }
429
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 {
432         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433                 return &dpsoftrast.texture[index];
434         return NULL;
435 }
436
437 static void DPSOFTRAST_Texture_Grow(void)
438 {
439         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440         DPSOFTRAST_State_Thread *thread;
441         int i;
442         int j;
443         DPSOFTRAST_Flush();
444         // expand texture array as needed
445         if (dpsoftrast.texture_max < 1024)
446                 dpsoftrast.texture_max = 1024;
447         else
448                 dpsoftrast.texture_max *= 2;
449         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451                 if (dpsoftrast.texbound[i])
452                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453         for (j = 0; j < dpsoftrast.numthreads; j++)
454         {
455                 thread = &dpsoftrast.threads[j];
456                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                         if (thread->texbound[i])
458                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
459         }
460 }
461
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
463 {
464         int w;
465         int h;
466         int d;
467         int size;
468         int s;
469         int texnum;
470         int mipmaps;
471         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473         DPSOFTRAST_Texture *texture;
474         if (width*height*depth < 1)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
477                 return 0;
478         }
479         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
482                 return 0;
483         }
484         switch(texformat)
485         {
486         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489                 break;
490         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if (depth != 1)
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
499                         return 0;
500                 }
501                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502                 {
503                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
504                         return 0;
505                 }
506                 break;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
521                 return 0;
522         }
523         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
526                 return 0;
527         }
528         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
531                 return 0;
532         }
533         // find first empty slot in texture array
534         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535                 if (!dpsoftrast.texture[texnum].bytes)
536                         break;
537         dpsoftrast.texture_firstfree = texnum + 1;
538         if (dpsoftrast.texture_max <= texnum)
539                 DPSOFTRAST_Texture_Grow();
540         if (dpsoftrast.texture_end <= texnum)
541                 dpsoftrast.texture_end = texnum + 1;
542         texture = &dpsoftrast.texture[texnum];
543         memset(texture, 0, sizeof(*texture));
544         texture->flags = flags;
545         texture->width = width;
546         texture->height = height;
547         texture->depth = depth;
548         texture->sides = sides;
549         texture->binds = 0;
550         w = width;
551         h = height;
552         d = depth;
553         size = 0;
554         mipmaps = 0;
555         w = width;
556         h = height;
557         d = depth;
558         for (;;)
559         {
560                 s = w * h * d * sides * 4;
561                 texture->mipmap[mipmaps][0] = size;
562                 texture->mipmap[mipmaps][1] = s;
563                 texture->mipmap[mipmaps][2] = w;
564                 texture->mipmap[mipmaps][3] = h;
565                 texture->mipmap[mipmaps][4] = d;
566                 size += s;
567                 mipmaps++;
568                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569                         break;
570                 if (w > 1) w >>= 1;
571                 if (h > 1) h >>= 1;
572                 if (d > 1) d >>= 1;
573         }
574         texture->mipmaps = mipmaps;
575         texture->size = size;
576
577         // allocate the pixels now
578         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
579
580         return texnum;
581 }
582 void DPSOFTRAST_Texture_Free(int index)
583 {
584         DPSOFTRAST_Texture *texture;
585         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
586         if (texture->binds)
587                 DPSOFTRAST_Flush();
588         if (texture->bytes)
589                 MM_FREE(texture->bytes);
590         texture->bytes = NULL;
591         memset(texture, 0, sizeof(*texture));
592         // adjust the free range and used range
593         if (dpsoftrast.texture_firstfree > index)
594                 dpsoftrast.texture_firstfree = index;
595         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596                 dpsoftrast.texture_end--;
597 }
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 {
600         int i, x, y, z, w, layer0, layer1, row0, row1;
601         unsigned char *o, *i0, *i1, *i2, *i3;
602         DPSOFTRAST_Texture *texture;
603         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604         if (texture->mipmaps <= 1)
605                 return;
606         for (i = 1;i < texture->mipmaps;i++)
607         {
608                 for (z = 0;z < texture->mipmap[i][4];z++)
609                 {
610                         layer0 = z*2;
611                         layer1 = z*2+1;
612                         if (layer1 >= texture->mipmap[i-1][4])
613                                 layer1 = texture->mipmap[i-1][4]-1;
614                         for (y = 0;y < texture->mipmap[i][3];y++)
615                         {
616                                 row0 = y*2;
617                                 row1 = y*2+1;
618                                 if (row1 >= texture->mipmap[i-1][3])
619                                         row1 = texture->mipmap[i-1][3]-1;
620                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
621                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625                                 w = texture->mipmap[i][2];
626                                 if (layer1 > layer0)
627                                 {
628                                         if (texture->mipmap[i-1][2] > 1)
629                                         {
630                                                 // average 3D texture
631                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632                                                 {
633                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
637                                                 }
638                                         }
639                                         else
640                                         {
641                                                 // average 3D mipmap with parent width == 1
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648                                                 }
649                                         }
650                                 }
651                                 else
652                                 {
653                                         if (texture->mipmap[i-1][2] > 1)
654                                         {
655                                                 // average 2D texture (common case)
656                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657                                                 {
658                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
662                                                 }
663                                         }
664                                         else
665                                         {
666                                                 // 2D texture with parent width == 1
667                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
668                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
669                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
670                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
671                                         }
672                                 }
673                         }
674                 }
675         }
676 }
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 {
679         DPSOFTRAST_Texture *texture;
680         unsigned char *dst;
681         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
682         if (texture->binds)
683                 DPSOFTRAST_Flush();
684         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685         while (blockheight > 0)
686         {
687                 memcpy(dst, pixels, blockwidth * 4);
688                 pixels += blockwidth * 4;
689                 dst += texture->mipmap[0][2] * 4;
690                 blockheight--;
691         }
692         DPSOFTRAST_Texture_CalculateMipmaps(index);
693 }
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 {
696         DPSOFTRAST_Texture *texture;
697         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
698         if (texture->binds)
699                 DPSOFTRAST_Flush();
700         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707         return texture->mipmap[mip][2];
708 }
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][3];
714 }
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][4];
720 }
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         if (texture->binds)
726                 DPSOFTRAST_Flush();
727         return texture->bytes + texture->mipmap[mip][0];
728 }
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734         {
735                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
736                 return;
737         }
738         if (texture->binds)
739                 DPSOFTRAST_Flush();
740         texture->filter = filter;
741 }
742
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 {
745         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748                 DPSOFTRAST_Flush();
749         dpsoftrast.fb_width = width;
750         dpsoftrast.fb_height = height;
751         dpsoftrast.fb_depthpixels = depthpixels;
752         dpsoftrast.fb_colorpixels[0] = colorpixels0;
753         dpsoftrast.fb_colorpixels[1] = colorpixels1;
754         dpsoftrast.fb_colorpixels[2] = colorpixels2;
755         dpsoftrast.fb_colorpixels[3] = colorpixels3;
756 }
757
758 static void DPSOFTRAST_Draw_FlushThreads(void);
759
760 static void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         if(dpsoftrast.usethreads) MEMORY_BARRIER;
763         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
764 }
765
// Blocks until no more than DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space units of the
// command pool are still in use, so that 'space' units can be handed out.
// Pool usage is recomputed from how far each thread's commandoffset lags
// behind freecommand, and the thread that lags the most is waited on.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
	DPSOFTRAST_State_Thread *thread;
	int i;
	int freecommand = dpsoftrast.commandpool.freecommand;
	int usedcommands = dpsoftrast.commandpool.usedcommands;
	// fast path: the cached usage figure already leaves enough room
	if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
		return;
	// publish everything queued so far as dpsoftrast.drawcommand
	DPSOFTRAST_Draw_SyncCommands();
	for(;;)
	{
		int waitindex = -1;
		int commandoffset;
		usedcommands = 0;
		// find the thread whose read position is furthest behind freecommand;
		// its distance (wrapped around the pool size) is the real usage
		for (i = 0; i < dpsoftrast.numthreads; i++)
		{
			thread = &dpsoftrast.threads[i];
			commandoffset = freecommand - thread->commandoffset;
			if (commandoffset < 0)
				commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
			if (commandoffset > usedcommands)
			{
				waitindex = i;
				usedcommands = commandoffset;
			}
		}
		// done when enough room is free, or when no thread lags at all
		if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
			break;
		thread = &dpsoftrast.threads[waitindex];
		Thread_LockMutex(thread->drawmutex);
		// only wait if the thread has not yet reached the published position
		if (thread->commandoffset != dpsoftrast.drawcommand)
		{
			thread->waiting = true;
			// signal drawcond if the thread has flagged itself as starving
			if (thread->starving) Thread_CondSignal(thread->drawcond);
			Thread_CondWait(thread->waitcond, thread->drawmutex);
			thread->waiting = false;
		}
		Thread_UnlockMutex(thread->drawmutex);
	}
	// cache the recomputed usage for later allocations
	dpsoftrast.commandpool.usedcommands = usedcommands;
}
807
// DPSOFTRAST_ALIGNCOMMAND rounds 'size' up to the next multiple of
// COMMAND_SIZE (the mask arithmetic assumes COMMAND_SIZE is a power of two).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// DPSOFTRAST_ALLOCATECOMMAND(name) reserves an aligned
// DPSOFTRAST_Command_<name> in the command pool with opcode
// DPSOFTRAST_OPCODE_<name> and yields a pointer to it.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
812
// Reserves 'size' units at the write position of the circular command pool,
// fills in the command header (opcode and commandsize) and returns a pointer
// to the command. When the command would not fit before the end of the pool,
// a DPSOFTRAST_OPCODE_Reset marker is written at the current position and the
// command is placed at offset 0 instead.
static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
{
	DPSOFTRAST_Command *command;
	int freecommand = dpsoftrast.commandpool.freecommand;
	int usedcommands = dpsoftrast.commandpool.usedcommands;
	// always keep room for one extra command header
	int extra = sizeof(DPSOFTRAST_Command);
	// the tail of the pool is skipped when the command does not fit there,
	// so it counts as space that has to be available as well
	if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
		extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
	if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
	{
		// not enough room: wait for threads to consume commands, or
		// flush when not using threads
		if (dpsoftrast.usethreads)
			DPSOFTRAST_Draw_FreeCommandPool(size + extra);
		else
			DPSOFTRAST_Draw_FlushThreads();
		// both calls may have changed the pool state, reload it
		freecommand = dpsoftrast.commandpool.freecommand;
		usedcommands = dpsoftrast.commandpool.usedcommands;
	}
	if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
	{
		// mark the unused tail with a Reset opcode and wrap to the start
		command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
		command->opcode = DPSOFTRAST_OPCODE_Reset;
		usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
		freecommand = 0;
	}
	command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
	command->opcode = opcode;
	command->commandsize = size;
	freecommand += size;
	// a command ending exactly at the pool end wraps the write position
	if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
		freecommand = 0;
	dpsoftrast.commandpool.freecommand = freecommand;
	dpsoftrast.commandpool.usedcommands = usedcommands + size;
	return command;
}
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y;
888         int miny1 = thread->miny1;
889         int maxy1 = thread->maxy1;
890         int miny2 = thread->miny2;
891         int maxy2 = thread->maxy2;
892         int bandy;
893         unsigned int *p;
894         unsigned int c;
895         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896         x1 = thread->fb_scissor[0];
897         y1 = thread->fb_scissor[1];
898         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900         if (y1 < miny1) y1 = miny1;
901         if (y2 > maxy2) y2 = maxy2;
902         w = x2 - x1;
903         h = y2 - y1;
904         if (w < 1 || h < 1)
905                 return;
906         // FIXME: honor fb_colormask?
907         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908         for (i = 0;i < 4;i++)
909         {
910                 if (!dpsoftrast.fb_colorpixels[i])
911                         continue;
912                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
913                 for (;y < bandy;y++)
914                 {
915                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916                         for (x = x1;x < x2;x++)
917                                 p[x] = c;
918                 }
919         }
920 }
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
922 {
923         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
924         command->r = r;
925         command->g = g;
926         command->b = b;
927         command->a = a;
928 }
929
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
932 {
933         int x1, y1, x2, y2, w, h, x, y;
934         int miny1 = thread->miny1;
935         int maxy1 = thread->maxy1;
936         int miny2 = thread->miny2;
937         int maxy2 = thread->maxy2;
938         int bandy;
939         unsigned int *p;
940         unsigned int c;
941         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942         x1 = thread->fb_scissor[0];
943         y1 = thread->fb_scissor[1];
944         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946         if (y1 < miny1) y1 = miny1;
947         if (y2 > maxy2) y2 = maxy2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
953         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
954         for (;y < bandy;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->depth = d;
965 }
966
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
969 {
970         thread->colormask[0] = command->r != 0;
971         thread->colormask[1] = command->g != 0;
972         thread->colormask[2] = command->b != 0;
973         thread->colormask[3] = command->a != 0;
974         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
975 }
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
977 {
978         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
987 {
988         thread->depthtest = command->enable;
989         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
990 }
991 void DPSOFTRAST_DepthTest(int enable)
992 {
993         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994         command->enable = enable;
995 }
996
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
999 {
1000         thread->scissortest = command->enable;
1001         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1002 }
1003 void DPSOFTRAST_ScissorTest(int enable)
1004 {
1005         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006         command->enable = enable;
1007 }
1008
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1011 {
1012         thread->scissor[0] = command->x;
1013         thread->scissor[1] = command->y;
1014         thread->scissor[2] = command->width;
1015         thread->scissor[3] = command->height;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1019 {
1020         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1021         command->x = x;
1022         command->y = y;
1023         command->width = width;
1024         command->height = height;
1025 }
1026
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1029 {
1030         thread->blendfunc[0] = command->sfactor;
1031         thread->blendfunc[1] = command->dfactor;
1032         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1033 }
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1035 {
1036         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037         command->sfactor = sfactor;
1038         command->dfactor = dfactor;
1039 }
1040
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1043 {
1044         thread->blendsubtract = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1046 }
1047 void DPSOFTRAST_BlendSubtract(int enable)
1048 {
1049         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1055 {
1056         thread->depthmask = command->enable;
1057 }
1058 void DPSOFTRAST_DepthMask(int enable)
1059 {
1060         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1066 {
1067         thread->depthfunc = command->func;
1068 }
1069 void DPSOFTRAST_DepthFunc(int func)
1070 {
1071         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072         command->func = func;
1073 }
1074
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1077 {
1078         thread->depthrange[0] = command->nearval;
1079         thread->depthrange[1] = command->farval;
1080 }
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1082 {
1083         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084         command->nearval = nearval;
1085         command->farval = farval;
1086 }
1087
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1090 {
1091         thread->polygonoffset[0] = command->alongnormal;
1092         thread->polygonoffset[1] = command->intoview;
1093 }
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1095 {
1096         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097         command->alongnormal = alongnormal;
1098         command->intoview = intoview;
1099 }
1100
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1103 {
1104         thread->cullface = command->mode;
1105 }
1106 void DPSOFTRAST_CullFace(int mode)
1107 {
1108         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109         command->mode = mode;
1110 }
1111
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1114 {
1115         thread->alphatest = command->enable;
1116 }
1117 void DPSOFTRAST_AlphaTest(int enable)
1118 {
1119         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1125 {
1126         thread->alphafunc = command->func;
1127         thread->alphavalue = command->ref;
1128 }
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1130 {
1131         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132         command->func = func;
1133         command->ref = ref;
1134 }
1135
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1137 {
1138         dpsoftrast.color[0] = r;
1139         dpsoftrast.color[1] = g;
1140         dpsoftrast.color[2] = b;
1141         dpsoftrast.color[3] = a;
1142 }
1143
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1145 {
1146         int outstride = blockwidth * 4;
1147         int instride = dpsoftrast.fb_width * 4;
1148         int bx1 = blockx;
1149         int by1 = blocky;
1150         int bx2 = blockx + blockwidth;
1151         int by2 = blocky + blockheight;
1152         int bw;
1153         int bh;
1154         int x;
1155         int y;
1156         unsigned char *inpixels;
1157         unsigned char *b;
1158         unsigned char *o;
1159         DPSOFTRAST_Flush();
1160         if (bx1 < 0) bx1 = 0;
1161         if (by1 < 0) by1 = 0;
1162         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1163         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1164         bw = bx2 - bx1;
1165         bh = by2 - by1;
1166         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1167         if (dpsoftrast.bigendian)
1168         {
1169                 for (y = by1;y < by2;y++)
1170                 {
1171                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1172                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1173                         for (x = bx1;x < bx2;x++)
1174                         {
1175                                 o[0] = b[3];
1176                                 o[1] = b[2];
1177                                 o[2] = b[1];
1178                                 o[3] = b[0];
1179                                 o += 4;
1180                                 b += 4;
1181                         }
1182                 }
1183         }
1184         else
1185         {
1186                 for (y = by1;y < by2;y++)
1187                 {
1188                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1190                         memcpy(o, b, bw*4);
1191                 }
1192         }
1193
1194 }
1195 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1196 {
1197         int tx1 = tx;
1198         int ty1 = ty;
1199         int tx2 = tx + width;
1200         int ty2 = ty + height;
1201         int sx1 = sx;
1202         int sy1 = sy;
1203         int sx2 = sx + width;
1204         int sy2 = sy + height;
1205         int swidth;
1206         int sheight;
1207         int twidth;
1208         int theight;
1209         int sw;
1210         int sh;
1211         int tw;
1212         int th;
1213         int y;
1214         unsigned int *spixels;
1215         unsigned int *tpixels;
1216         DPSOFTRAST_Texture *texture;
1217         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1218         if (mip < 0 || mip >= texture->mipmaps) return;
1219         DPSOFTRAST_Flush();
1220         spixels = dpsoftrast.fb_colorpixels[0];
1221         swidth = dpsoftrast.fb_width;
1222         sheight = dpsoftrast.fb_height;
1223         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1224         twidth = texture->mipmap[mip][2];
1225         theight = texture->mipmap[mip][3];
1226         if (tx1 < 0) tx1 = 0;
1227         if (ty1 < 0) ty1 = 0;
1228         if (tx2 > twidth) tx2 = twidth;
1229         if (ty2 > theight) ty2 = theight;
1230         if (sx1 < 0) sx1 = 0;
1231         if (sy1 < 0) sy1 = 0;
1232         if (sx2 > swidth) sx2 = swidth;
1233         if (sy2 > sheight) sy2 = sheight;
1234         tw = tx2 - tx1;
1235         th = ty2 - ty1;
1236         sw = sx2 - sx1;
1237         sh = sy2 - sy1;
1238         if (tw > sw) tw = sw;
1239         if (th > sh) th = sh;
1240         if (tw < 1 || th < 1)
1241                 return;
1242         for (y = 0;y < th;y++)
1243                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1244         if (texture->mipmaps > 1)
1245                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1246 }
1247
1248 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1249 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1250 {
1251         if (thread->texbound[command->unitnum])
1252                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1253         thread->texbound[command->unitnum] = command->texture;
1254 }
1255 void DPSOFTRAST_SetTexture(int unitnum, int index)
1256 {
1257         DPSOFTRAST_Command_SetTexture *command;
1258         DPSOFTRAST_Texture *texture;
1259         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1260         {
1261                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1262                 return;
1263         }
1264         texture = DPSOFTRAST_Texture_GetByIndex(index);
1265         if (index && !texture)
1266         {
1267                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1268                 return;
1269         }
1270
1271         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1272         command->unitnum = unitnum;
1273         command->texture = texture;
1274
1275         dpsoftrast.texbound[unitnum] = texture;
1276         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1277 }
1278
1279 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1280 {
1281         dpsoftrast.pointer_vertex3f = vertex3f;
1282         dpsoftrast.stride_vertex = stride;
1283 }
1284 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1285 {
1286         dpsoftrast.pointer_color4f = color4f;
1287         dpsoftrast.pointer_color4ub = NULL;
1288         dpsoftrast.stride_color = stride;
1289 }
1290 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1291 {
1292         dpsoftrast.pointer_color4f = NULL;
1293         dpsoftrast.pointer_color4ub = color4ub;
1294         dpsoftrast.stride_color = stride;
1295 }
1296 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1297 {
1298         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1299         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1300         dpsoftrast.stride_texcoord[unitnum] = stride;
1301 }
1302
1303 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1304 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1305 {
1306         thread->shader_mode = command->mode;
1307         thread->shader_permutation = command->permutation;
1308 }
1309 void DPSOFTRAST_SetShader(int mode, int permutation)
1310 {
1311         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1312         command->mode = mode;
1313         command->permutation = permutation;
1314
1315         dpsoftrast.shader_mode = mode;
1316         dpsoftrast.shader_permutation = permutation;
1317 }
1318
1319 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1320 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1321 {
1322         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1323 }
1324 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1325 {
1326         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1327         command->index = index;
1328         command->val[0] = v0;
1329         command->val[1] = v1;
1330         command->val[2] = v2;
1331         command->val[3] = v3;
1332
1333         dpsoftrast.uniform4f[index*4+0] = v0;
1334         dpsoftrast.uniform4f[index*4+1] = v1;
1335         dpsoftrast.uniform4f[index*4+2] = v2;
1336         dpsoftrast.uniform4f[index*4+3] = v3;
1337 }
1338 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1339 {
1340         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1341         command->index = index;
1342         memcpy(command->val, v, sizeof(command->val));
1343
1344         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1345 }
1346
1347 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1348 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1349 {
1350         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1351 }
1352 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1353 {
1354 #ifdef SSE2_PRESENT
1355         int i, index;
1356         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1357         {
1358                 __m128 m0, m1, m2, m3;
1359                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1360                 command->index = index;
1361                 if (((size_t)v)&(ALIGN_SIZE-1))
1362                 {
1363                         m0 = _mm_loadu_ps(v);
1364                         m1 = _mm_loadu_ps(v+4);
1365                         m2 = _mm_loadu_ps(v+8);
1366                         m3 = _mm_loadu_ps(v+12);
1367                 }
1368                 else
1369                 {
1370                         m0 = _mm_load_ps(v);
1371                         m1 = _mm_load_ps(v+4);
1372                         m2 = _mm_load_ps(v+8);
1373                         m3 = _mm_load_ps(v+12);
1374                 }
1375                 if (transpose)
1376                 {
1377                         __m128 t0, t1, t2, t3;
1378                         t0 = _mm_unpacklo_ps(m0, m1);
1379                         t1 = _mm_unpacklo_ps(m2, m3);
1380                         t2 = _mm_unpackhi_ps(m0, m1);
1381                         t3 = _mm_unpackhi_ps(m2, m3);
1382                         m0 = _mm_movelh_ps(t0, t1);
1383                         m1 = _mm_movehl_ps(t1, t0);
1384                         m2 = _mm_movelh_ps(t2, t3);
1385                         m3 = _mm_movehl_ps(t3, t2);                     
1386                 }
1387                 _mm_store_ps(command->val, m0);
1388                 _mm_store_ps(command->val+4, m1);
1389                 _mm_store_ps(command->val+8, m2);
1390                 _mm_store_ps(command->val+12, m3);
1391                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1392                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1393                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1394                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1395         }
1396 #endif
1397 }
1398
1399 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1400 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1401 {
1402         thread->uniform1i[command->index] = command->val;
1403 }
1404 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1405 {
1406         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1407         command->index = index;
1408         command->val = i0;
1409
1410         dpsoftrast.uniform1i[command->index] = i0;
1411 }
1412
1413 #ifdef SSE2_PRESENT
1414 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1415 {
1416         float *end = dst + size*4;
1417         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1418         {
1419                 while (dst < end)
1420                 {
1421                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1422                         dst += 4;
1423                         src += stride;
1424                 }
1425         }
1426         else
1427         {
1428                 while (dst < end)
1429                 {
1430                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1431                         dst += 4;
1432                         src += stride;
1433                 }
1434         }
1435 }
1436
1437 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1438 {
1439         float *end = dst + size*4;
1440         if (stride == sizeof(float[3]))
1441         {
1442                 float *end4 = dst + (size&~3)*4;        
1443                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1444                 {
1445                         while (dst < end4)
1446                         {
1447                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1448                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1449                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1450                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1451                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1452                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1453                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1454                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1455                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1456                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1457                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1459                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1460                                 dst += 16;
1461                                 src += 4*sizeof(float[3]);
1462                         }
1463                 }
1464                 else
1465                 {
1466                         while (dst < end4)
1467                         {
1468                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1469                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1481                                 dst += 16;
1482                                 src += 4*sizeof(float[3]);
1483                         }
1484                 }
1485         }
1486         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1487         {
1488                 while (dst < end)
1489                 {
1490                         __m128 v = _mm_loadu_ps((const float *)src);
1491                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1492                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1493                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1494                         _mm_store_ps(dst, v);
1495                         dst += 4;
1496                         src += stride;
1497                 }
1498         }
1499         else
1500         {
1501                 while (dst < end)
1502                 {
1503                         __m128 v = _mm_load_ps((const float *)src);
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507                         _mm_store_ps(dst, v);
1508                         dst += 4;
1509                         src += stride;
1510                 }
1511         }
1512 }
1513
1514 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1515 {
1516         float *end = dst + size*4;
1517         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1518         if (stride == sizeof(float[2]))
1519         {
1520                 float *end2 = dst + (size&~1)*4;
1521                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1522                 {
1523                         while (dst < end2)
1524                         {
1525                                 __m128 v = _mm_loadu_ps((const float *)src);
1526                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1527                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1528                                 dst += 8;
1529                                 src += 2*sizeof(float[2]);
1530                         }
1531                 }
1532                 else
1533                 {
1534                         while (dst < end2)
1535                         {
1536                                 __m128 v = _mm_load_ps((const float *)src);
1537                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1538                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1539                                 dst += 8;
1540                                 src += 2*sizeof(float[2]);
1541                         }
1542                 }
1543         }
1544         while (dst < end)
1545         {
1546                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1547                 dst += 4;
1548                 src += stride;
1549         }
1550 }
1551
1552 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1553 {
1554         float *end = dst + size*4;
1555         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1556         if (stride == sizeof(unsigned char[4]))
1557         {
1558                 float *end4 = dst + (size&~3)*4;
1559                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1560                 {
1561                         while (dst < end4)
1562                         {
1563                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1564                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1565                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1566                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1567                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1568                                 dst += 16;
1569                                 src += 4*sizeof(unsigned char[4]);
1570                         }
1571                 }
1572                 else
1573                 {
1574                         while (dst < end4)
1575                         {
1576                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1581                                 dst += 16;
1582                                 src += 4*sizeof(unsigned char[4]);
1583                         }
1584                 }
1585         }
1586         while (dst < end)
1587         {
1588                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1589                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1590                 dst += 4;
1591                 src += stride;
1592         }
1593 }
1594
// Writes the same float[4] value to `size` consecutive float[4] slots.
//   dst  - receives size*4 floats; written with aligned stores (_mm_store_ps)
//   src  - the 4 floats to replicate, read once with an unaligned load
//   size - number of float[4] slots to fill
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1605 #endif
1606
// Multiplies numitems float[4] vectors by a 16 float matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// An exact identity matrix is detected with memcmp and handled as a plain copy
// (or as nothing at all when out4f and in4f are the same pointer).
// out4f is written with aligned stores; in4f may have any alignment.
// The body is only compiled when SSE2_PRESENT is defined.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	// the four matrix vectors scaled by the x, y, z and w input components
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) != 0)
	{
		// input is not 16 byte aligned
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			// summed as x + (y + (z + w))
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			// summed as x + (y + (z + w))
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1654
// Copies numitems float[4] vectors from in4f to out4f with memcpy.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1659
1660 #ifdef SSE2_PRESENT
// Perspective-divides and viewport-maps one vertex held in an __m128.
// With in = (x, y, z, w) the steps are:
//   - rotate to (w, x, y, z) and overwrite lane 0 with 1.0f -> (1, x, y, z)
//   - compute viewportcenter + viewportscale * (1, x, y, z) / w per lane
//   - rotate back so that lanes 0..2 hold the mapped x, y, z and lane 3 holds
//     viewportcenter[0] + viewportscale[0] / w
// viewportcenter and viewportscale are therefore consumed in (w-slot, x, y, z)
// lane order.
// The expansion is a brace block declaring locals named p and w, so the
// viewportcenter/viewportscale arguments must not themselves be named p or w;
// `in` is evaluated once, `out` is assigned once.
1661 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1662 { \
1663         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1664         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1665         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1666         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1667 }
1668
// Expands to exactly the same statements as DPSOFTRAST_PROJECTVERTEX:
// rotates in = (x, y, z, w) to (1, x, y, z), computes
// viewportcenter + viewportscale * (1, x, y, z) / w per lane, and rotates the
// result back before assigning it to out.
// Declares block-local p and w, so the viewport arguments must not be named
// p or w.
// NOTE(review): despite the name, nothing here is specific to Y - confirm
// whether a Y-only variant was intended.
1669 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1670 { \
1671         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1675 }
1676
// Transforms one vertex held in an __m128 by four __m128 matrix vectors:
//   out = in[0]*m0 + (in[1]*m1 + (in[2]*m2 + in[3]*m3))
// where in[k] is lane k of `in` broadcast to all four lanes.
// The expansion is a brace block declaring a local named p, so the m0..m3
// arguments must not be named p; `in` is evaluated once.
1677 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1678 { \
1679         __m128 p = (in); \
1680         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1681                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1682                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1683                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1684 }
1685
// Computes a screen-space Y range for the box spanned by minpos..maxpos.
// The 8 box corners are transformed by m0..m3; corners with z + w >= 0 contribute
// their y/w directly, and corners failing that test contribute the points where
// their box edges to passing corners reach z + w == 0.
// The resulting min/max y/w are clamped to [-2, 2], mapped through lane 2 of
// viewportcenter/viewportscale, truncated to int and written to *starty / *endy.
// Returns a bitmask in which bit k is still set if corner k failed the
// z + w >= 0 test (0 means no corner failed, 0xFF means all of them did).
// Corner numbering: bit 0 of k picks maxpos for lane 0, bit 1 for lane 1,
// bit 2 for lanes 2 and 3; cleared bits pick minpos.
// Note that the BBFRONT and BBCLIP helper macros are not #undef'd afterwards.
1686 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1687 {
1688         int clipmask = 0xFF;
1689         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        // swap lanes 0 and 1 of every matrix vector so the transformed Y ends up
        // in lane 0, where the scalar (_ss) intrinsics below operate
1690         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1691         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1692         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1693         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k, store z + w in clipdist[k]; when it
        // is >= 0 clear bit k of clipmask and merge lane0/lane3 into minproj/maxproj
1694         #define BBFRONT(k, pos) \
1695         { \
1696                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1697                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1698                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1699                 { \
1700                         __m128 proj; \
1701                         clipmask &= ~(1<<k); \
1702                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1703                         minproj = _mm_min_ss(minproj, proj); \
1704                         maxproj = _mm_max_ss(maxproj, proj); \
1705                 } \
1706         }
1707         BBFRONT(0, minpos); 
1708         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1709         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1710         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1711         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1712         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1713         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1714         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner still flagged in clipmask, visit the three corners
        // that differ from it in one index bit (k^1, k^2, k^4); for each one that
        // passed, interpolate along the edge to where clipdist reaches zero
        // (frac = d[k] / (d[k] - d[neighbour])) and merge that point's lane0/lane3
1715         #define BBCLIP(k) \
1716         { \
1717                 if (clipmask&(1<<k)) \
1718                 { \
1719                         if (!(clipmask&(1<<(k^1)))) \
1720                         { \
1721                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1722                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1723                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1724                                 minproj = _mm_min_ss(minproj, proj); \
1725                                 maxproj = _mm_max_ss(maxproj, proj); \
1726                         } \
1727                         if (!(clipmask&(1<<(k^2)))) \
1728                         { \
1729                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1730                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1731                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732                                 minproj = _mm_min_ss(minproj, proj); \
1733                                 maxproj = _mm_max_ss(maxproj, proj); \
1734                         } \
1735                         if (!(clipmask&(1<<(k^4)))) \
1736                         { \
1737                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1738                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1739                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740                                 minproj = _mm_min_ss(minproj, proj); \
1741                                 maxproj = _mm_max_ss(maxproj, proj); \
1742                         } \
1743                 } \
1744         }
1745         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring lane 2 of the viewport vectors into lane 0 for the scalar math below
1746         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1747         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // limit the projected range to [-2, 2] before mapping it to the viewport
1748         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1749         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1750         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1751         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // NOTE(review): starty is taken from maxproj and endy from minproj, which
        // presumably relies on the Y viewport scale being negative - confirm
1752         *starty = _mm_cvttss_si32(maxproj);
1753         *endy = _mm_cvttss_si32(minproj)+1;
1754         return clipmask;
1755 }
1756         
1757 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1758 {
1759         float *end = out4f + numitems*4;
1760         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1761         __m128 minpos, maxpos;
1762         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1763         {
1764                 minpos = maxpos = _mm_loadu_ps(in4f);
1765                 while (out4f < end)
1766                 {
1767                         __m128 v = _mm_loadu_ps(in4f);
1768                         minpos = _mm_min_ps(minpos, v);
1769                         maxpos = _mm_max_ps(maxpos, v);
1770                         _mm_store_ps(out4f, v);
1771                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1772                         _mm_store_ps(screen4f, v);
1773                         in4f += 4;
1774                         out4f += 4;
1775                         screen4f += 4;
1776                 }
1777         }
1778         else
1779         {
1780                 minpos = maxpos = _mm_load_ps(in4f);
1781                 while (out4f < end)
1782                 {
1783                         __m128 v = _mm_load_ps(in4f);
1784                         minpos = _mm_min_ps(minpos, v);
1785                         maxpos = _mm_max_ps(maxpos, v);
1786                         _mm_store_ps(out4f, v);
1787                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1788                         _mm_store_ps(screen4f, v);
1789                         in4f += 4;
1790                         out4f += 4;
1791                         screen4f += 4;
1792                 }
1793         }
1794         if (starty && endy) 
1795                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1796                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1797                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1798                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1799                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1800         return 0;
1801 }
1802
1803 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1804 {
1805         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1806         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1807         float *end;
1808         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1809                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1810         end = out4f + numitems*4;
1811         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1812         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1813         m0 = _mm_loadu_ps(inmatrix16f);
1814         m1 = _mm_loadu_ps(inmatrix16f + 4);
1815         m2 = _mm_loadu_ps(inmatrix16f + 8);
1816         m3 = _mm_loadu_ps(inmatrix16f + 12);
1817         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1818         {
1819                 minpos = maxpos = _mm_loadu_ps(in4f);
1820                 while (out4f < end)
1821                 {
1822                         __m128 v = _mm_loadu_ps(in4f);
1823                         minpos = _mm_min_ps(minpos, v);
1824                         maxpos = _mm_max_ps(maxpos, v);
1825                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1826                         _mm_store_ps(out4f, v);
1827                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828                         _mm_store_ps(screen4f, v);
1829                         in4f += 4;
1830                         out4f += 4;
1831                         screen4f += 4;
1832                 }
1833         }
1834         else
1835         {
1836                 minpos = maxpos = _mm_load_ps(in4f);
1837                 while (out4f < end)
1838                 {
1839                         __m128 v = _mm_load_ps(in4f);
1840                         minpos = _mm_min_ps(minpos, v);
1841                         maxpos = _mm_max_ps(maxpos, v);
1842                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1843                         _mm_store_ps(out4f, v);
1844                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1845                         _mm_store_ps(screen4f, v);
1846                         in4f += 4;
1847                         out4f += 4;
1848                         screen4f += 4;
1849                 }
1850         }
1851         if (starty && endy) 
1852                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1853         return 0;
1854 }
1855 #endif
1856
// Converts one of the currently bound vertex arrays into float[4] form in
// dpsoftrast.post_array4f[outarray] and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION - loaded from pointer_vertex3f as 3 floats
//   DPSOFTRAST_ARRAY_COLOR    - pointer_color4f if set, else pointer_color4ub,
//                               else filled with dpsoftrast.color
//   anything else             - treated as texcoord unit
//                               (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with 2, 3
//                               or 4 float components; nothing is written when
//                               that unit has no pointer or another component count
// The range firstvertex .. firstvertex+numvertices of the source is read.
// Returns NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	int unit;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, count, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
		}
	}
	else
	{
		unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch (dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1915
1916 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1917 {
1918         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1919         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1920         return data;
1921 }
1922
1923 #if 0
// Loads array `inarray` into post_array4f[outarray] (or, when inarray is
// negative, reuses what is already there), projects its vertices into
// dpsoftrast.screencoord4f and records the clip mask and Y range in
// dpsoftrast.drawclipped / drawstarty / drawendy.
// Returns the data buffer, or NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1934 #endif
1935
// Loads array `inarray` into post_array4f[outarray] (or, when inarray is
// negative, reuses what is already there), transforms its vertices in place by
// inmatrix16f, writes their projected form to dpsoftrast.screencoord4f and
// records the clip mask and Y range in dpsoftrast.drawclipped / drawstarty /
// drawendy.
// Returns the data buffer, or NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1946
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1948 {
1949         int x;
1950         int startx = span->startx;
1951         int endx = span->endx;
1952         float wslope = triangle->w[0];
1953         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1954         float endz = 1.0f / (w + wslope * startx);
1955         for (x = startx;x < endx;)
1956         {
1957                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1958                 float z = endz, dz;
1959                 if (nextsub >= endx) nextsub = endsub = endx-1;
1960                 endz = 1.0f / (w + wslope * nextsub);
1961                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962                 for (; x <= endsub; x++, z += dz)
1963                         zf[x] = z;
1964         }
1965 }
1966
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1968 {
1969         int x;
1970         int startx = span->startx;
1971         int endx = span->endx;
1972         int d[4];
1973         float a, b;
1974         unsigned char * RESTRICT pixelmask = span->pixelmask;
1975         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1976         if (!pixel)
1977                 return;
1978         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979         // handle alphatest now (this affects depth writes too)
1980         if (thread->alphatest)
1981                 for (x = startx;x < endx;x++)
1982                         if (in4f[x*4+3] < 0.5f)
1983                                 pixelmask[x] = false;
1984         // FIXME: this does not handle bigendian
1985         switch(thread->fb_blendmode)
1986         {
1987         case DPSOFTRAST_BLENDMODE_OPAQUE:
1988                 for (x = startx;x < endx;x++)
1989                 {
1990                         if (!pixelmask[x])
1991                                 continue;
1992                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_ALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = in4f[x*4+3] * 255.0f;
2008                         b = 1.0f - in4f[x*4+3];
2009                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020                 for (x = startx;x < endx;x++)
2021                 {
2022                         if (!pixelmask[x])
2023                                 continue;
2024                         a = in4f[x*4+3] * 255.0f;
2025                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_ADD:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044                         pixel[x*4+0] = d[0];
2045                         pixel[x*4+1] = d[1];
2046                         pixel[x*4+2] = d[2];
2047                         pixel[x*4+3] = d[3];
2048                 }
2049                 break;
2050         case DPSOFTRAST_BLENDMODE_INVMOD:
2051                 for (x = startx;x < endx;x++)
2052                 {
2053                         if (!pixelmask[x])
2054                                 continue;
2055                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059                         pixel[x*4+0] = d[0];
2060                         pixel[x*4+1] = d[1];
2061                         pixel[x*4+2] = d[2];
2062                         pixel[x*4+3] = d[3];
2063                 }
2064                 break;
2065         case DPSOFTRAST_BLENDMODE_MUL:
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (!pixelmask[x])
2069                                 continue;
2070                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074                         pixel[x*4+0] = d[0];
2075                         pixel[x*4+1] = d[1];
2076                         pixel[x*4+2] = d[2];
2077                         pixel[x*4+3] = d[3];
2078                 }
2079                 break;
2080         case DPSOFTRAST_BLENDMODE_MUL2:
2081                 for (x = startx;x < endx;x++)
2082                 {
2083                         if (!pixelmask[x])
2084                                 continue;
2085                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089                         pixel[x*4+0] = d[0];
2090                         pixel[x*4+1] = d[1];
2091                         pixel[x*4+2] = d[2];
2092                         pixel[x*4+3] = d[3];
2093                 }
2094                 break;
2095         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096                 for (x = startx;x < endx;x++)
2097                 {
2098                         if (!pixelmask[x])
2099                                 continue;
2100                         a = in4f[x*4+3] * -255.0f;
2101                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105                         pixel[x*4+0] = d[0];
2106                         pixel[x*4+1] = d[1];
2107                         pixel[x*4+2] = d[2];
2108                         pixel[x*4+3] = d[3];
2109                 }
2110                 break;
2111         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112                 for (x = startx;x < endx;x++)
2113                 {
2114                         if (!pixelmask[x])
2115                                 continue;
2116                         a = 255.0f;
2117                         b = 1.0f - in4f[x*4+3];
2118                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         case DPSOFTRAST_BLENDMODE_INVADD:
2129                 for (x = startx;x < endx;x++)
2130                 {
2131                         if (!pixelmask[x])
2132                                 continue;
2133                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137                         pixel[x*4+0] = d[0];
2138                         pixel[x*4+1] = d[1];
2139                         pixel[x*4+2] = d[2];
2140                         pixel[x*4+3] = d[3];
2141                 }
2142                 break;
2143         }
2144 }
2145
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2147 {
2148 #ifdef SSE2_PRESENT
2149         int x;
2150         int startx = span->startx;
2151         int endx = span->endx;
2152         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153         unsigned char * RESTRICT pixelmask = span->pixelmask;
2154         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2156         if (!pixel)
2157                 return;
2158         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159         pixeli += span->y * dpsoftrast.fb_width + span->x;
2160         // handle alphatest now (this affects depth writes too)
2161         if (thread->alphatest)
2162                 for (x = startx;x < endx;x++)
2163                         if (in4ub[x*4+3] < 0.5f)
2164                                 pixelmask[x] = false;
2165         // FIXME: this does not handle bigendian
2166         switch(thread->fb_blendmode)
2167         {
2168         case DPSOFTRAST_BLENDMODE_OPAQUE:
2169                 for (x = startx;x + 4 <= endx;)
2170                 {
2171                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2172                         {
2173                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2174                                 x += 4;
2175                         }
2176                         else
2177                         {
2178                                 if (pixelmask[x])
2179                                         pixeli[x] = ini[x];
2180                                 x++;
2181                         }
2182                 }
2183                 for (;x < endx;x++)
2184                         if (pixelmask[x])
2185                                 pixeli[x] = ini[x];
2186                 break;
2187         case DPSOFTRAST_BLENDMODE_ALPHA:
2188         #define FINISHBLEND(blend2, blend1) \
2189                 for (x = startx;x + 1 < endx;x += 2) \
2190                 { \
2191                         __m128i src, dst; \
2192                         switch (*(const unsigned short*)&pixelmask[x]) \
2193                         { \
2194                         case 0x0101: \
2195                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2197                                 blend2; \
2198                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2199                                 continue; \
2200                         case 0x0100: \
2201                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2203                                 blend1; \
2204                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2205                                 continue; \
2206                         case 0x0001: \
2207                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2209                                 blend1; \
2210                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2211                                 continue; \
2212                         } \
2213                         break; \
2214                 } \
2215                 for(;x < endx; x++) \
2216                 { \
2217                         __m128i src, dst; \
2218                         if (!pixelmask[x]) \
2219                                 continue; \
2220                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2222                         blend1; \
2223                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224                 }
2225
2226                 FINISHBLEND({
2227                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2229                 }, {
2230                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2232                 });
2233                 break;
2234         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2235                 FINISHBLEND({
2236                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2238                 }, {
2239                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2241                 });
2242                 break;
2243         case DPSOFTRAST_BLENDMODE_ADD:
2244                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2245                 break;
2246         case DPSOFTRAST_BLENDMODE_INVMOD:
2247                 FINISHBLEND({
2248                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                 }, {
2250                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251                 });
2252                 break;
2253         case DPSOFTRAST_BLENDMODE_MUL:
2254                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2255                 break;
2256         case DPSOFTRAST_BLENDMODE_MUL2:
2257                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2258                 break;
2259         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2260                 FINISHBLEND({
2261                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2263                 }, {
2264                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2266                 });
2267                 break;
2268         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2269                 FINISHBLEND({
2270                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2272                 }, {
2273                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2275                 });
2276                 break;
2277         case DPSOFTRAST_BLENDMODE_INVADD:
2278                 FINISHBLEND({
2279                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                 }, {
2281                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282                 });
2283                 break;
2284         }
2285 #endif
2286 }
2287
2288 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2289 {
2290         int x;
2291         int startx = span->startx;
2292         int endx = span->endx;
2293         int flags;
2294         float c[4];
2295         float data[4];
2296         float slope[4];
2297         float tc[2], endtc[2];
2298         float tcscale[2];
2299         unsigned int tci[2];
2300         unsigned int tci1[2];
2301         unsigned int tcimin[2];
2302         unsigned int tcimax[2];
2303         int tciwrapmask[2];
2304         int tciwidth;
2305         int filter;
2306         int mip;
2307         const unsigned char * RESTRICT pixelbase;
2308         const unsigned char * RESTRICT pixel[4];
2309         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2310         // if no texture is bound, just fill it with white
2311         if (!texture)
2312         {
2313                 for (x = startx;x < endx;x++)
2314                 {
2315                         out4f[x*4+0] = 1.0f;
2316                         out4f[x*4+1] = 1.0f;
2317                         out4f[x*4+2] = 1.0f;
2318                         out4f[x*4+3] = 1.0f;
2319                 }
2320                 return;
2321         }
2322         mip = triangle->mip[texunitindex];
2323         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2324         // if this mipmap of the texture is 1 pixel, just fill it with that color
2325         if (texture->mipmap[mip][1] == 4)
2326         {
2327                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2328                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2329                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2330                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2331                 for (x = startx;x < endx;x++)
2332                 {
2333                         out4f[x*4+0] = c[0];
2334                         out4f[x*4+1] = c[1];
2335                         out4f[x*4+2] = c[2];
2336                         out4f[x*4+3] = c[3];
2337                 }
2338                 return;
2339         }
2340         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2341         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2342         flags = texture->flags;
2343         tcscale[0] = texture->mipmap[mip][2];
2344         tcscale[1] = texture->mipmap[mip][3];
2345         tciwidth = texture->mipmap[mip][2];
2346         tcimin[0] = 0;
2347         tcimin[1] = 0;
2348         tcimax[0] = texture->mipmap[mip][2]-1;
2349         tcimax[1] = texture->mipmap[mip][3]-1;
2350         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2351         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2352         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2353         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2354         for (x = startx;x < endx;)
2355         {
2356                 unsigned int subtc[2];
2357                 unsigned int substep[2];
2358                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2359                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2360                 if (nextsub >= endx)
2361                 {
2362                         nextsub = endsub = endx-1;      
2363                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2364                 }
2365                 tc[0] = endtc[0];
2366                 tc[1] = endtc[1];
2367                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2368                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2369                 substep[0] = (endtc[0] - tc[0]) * subscale;
2370                 substep[1] = (endtc[1] - tc[1]) * subscale;
2371                 subtc[0] = tc[0] * (1<<16);
2372                 subtc[1] = tc[1] * (1<<16);
2373                 if (filter)
2374                 {
2375                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2376                         {
2377                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2378                                 {
2379                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2380                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2381                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2382                                         tci[0] = subtc[0]>>16;
2383                                         tci[1] = subtc[1]>>16;
2384                                         tci1[0] = tci[0] + 1;
2385                                         tci1[1] = tci[1] + 1;
2386                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2387                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2388                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2389                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2390                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2391                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2392                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2393                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2394                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2395                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2397                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2398                                         out4f[x*4+0] = c[0];
2399                                         out4f[x*4+1] = c[1];
2400                                         out4f[x*4+2] = c[2];
2401                                         out4f[x*4+3] = c[3];
2402                                 }
2403                         }
2404                         else
2405                         {
2406                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2407                                 {
2408                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2409                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2410                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2411                                         tci[0] = subtc[0]>>16;
2412                                         tci[1] = subtc[1]>>16;
2413                                         tci1[0] = tci[0] + 1;
2414                                         tci1[1] = tci[1] + 1;
2415                                         tci[0] &= tciwrapmask[0];
2416                                         tci[1] &= tciwrapmask[1];
2417                                         tci1[0] &= tciwrapmask[0];
2418                                         tci1[1] &= tciwrapmask[1];
2419                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2420                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2421                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2422                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2423                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2424                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2426                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2427                                         out4f[x*4+0] = c[0];
2428                                         out4f[x*4+1] = c[1];
2429                                         out4f[x*4+2] = c[2];
2430                                         out4f[x*4+3] = c[3];
2431                                 }
2432                         }
2433                 }
2434                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2435                 {
2436                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2437                         {
2438                                 tci[0] = subtc[0]>>16;
2439                                 tci[1] = subtc[1]>>16;
2440                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2443                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2444                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2445                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2446                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2447                                 out4f[x*4+0] = c[0];
2448                                 out4f[x*4+1] = c[1];
2449                                 out4f[x*4+2] = c[2];
2450                                 out4f[x*4+3] = c[3];
2451                         }
2452                 }
2453                 else
2454                 {
2455                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2456                         {
2457                                 tci[0] = subtc[0]>>16;
2458                                 tci[1] = subtc[1]>>16;
2459                                 tci[0] &= tciwrapmask[0];
2460                                 tci[1] &= tciwrapmask[1];
2461                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2463                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2464                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2465                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2466                                 out4f[x*4+0] = c[0];
2467                                 out4f[x*4+1] = c[1];
2468                                 out4f[x*4+2] = c[2];
2469                                 out4f[x*4+3] = c[3];
2470                         }
2471                 }
2472         }
2473 }
2474
2475 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2476 {
2477 #ifdef SSE2_PRESENT
2478         int x;
2479         int startx = span->startx;
2480         int endx = span->endx;
2481         int flags;
2482         __m128 data, slope, tcscale;
2483         __m128i tcsize, tcmask, tcoffset, tcmax;
2484         __m128 tc, endtc;
2485         __m128i subtc, substep, endsubtc;
2486         int filter;
2487         int mip;
2488         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2489         const unsigned char * RESTRICT pixelbase;
2490         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2491         // if no texture is bound, just fill it with white
2492         if (!texture)
2493         {
2494                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2495                 return;
2496         }
2497         mip = triangle->mip[texunitindex];
2498         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499         // if this mipmap of the texture is 1 pixel, just fill it with that color
2500         if (texture->mipmap[mip][1] == 4)
2501         {
2502                 unsigned int k = *((const unsigned int *)pixelbase);
2503                 for (x = startx;x < endx;x++)
2504                         outi[x] = k;
2505                 return;
2506         }
2507         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2508         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2509         flags = texture->flags;
2510         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2511         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2512         tcscale = _mm_cvtepi32_ps(tcsize);
2513         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2514         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2515         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2516         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2517         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2518         tcmax = _mm_packs_epi32(tcmask, tcmask);
2519         for (x = startx;x < endx;)
2520         {
2521                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2522                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2523                 if (nextsub >= endx)
2524                 {
2525                         nextsub = endsub = endx-1;
2526                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2527                 }       
2528                 tc = endtc;
2529                 subtc = endsubtc;
2530                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2531                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2532                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2533                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2534                 substep = _mm_slli_epi32(substep, 1);
2535                 if (filter)
2536                 {
2537                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2538                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2539                         {
2540                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2541                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2542                                 {
2543                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2544                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2545                                         tci = _mm_madd_epi16(tci, tcoffset);
2546                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2547                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2548                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2549                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2550                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2551                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2552                                         fracm = _mm_srli_epi16(subtc, 1);
2553                                         pix1 = _mm_add_epi16(pix1,
2554                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556                                         pix3 = _mm_add_epi16(pix3,
2557                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2558                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2559                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2560                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2561                                         pix2 = _mm_add_epi16(pix2,
2562                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2563                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2564                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2565                                 }
2566                                 if (x <= endsub)
2567                                 {
2568                                         const unsigned char * RESTRICT ptr1;
2569                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2570                                         tci = _mm_madd_epi16(tci, tcoffset);
2571                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2572                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574                                         fracm = _mm_srli_epi16(subtc, 1);
2575                                         pix1 = _mm_add_epi16(pix1,
2576                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2577                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2578                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2579                                         pix1 = _mm_add_epi16(pix1,
2580                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2581                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2582                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2583                                         x++;
2584                                 }
2585                         }
2586                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2587                         {
2588                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2589                                 {
2590                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2591                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2592                                         tci = _mm_madd_epi16(tci, tcoffset);
2593                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2594                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2595                                                                                         _mm_setzero_si128());
2596                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2597                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2598                                                                                         _mm_setzero_si128());
2599                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2600                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2601                                         tci = _mm_madd_epi16(tci, tcoffset);
2602                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2603                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2604                                                                                         _mm_setzero_si128());
2605                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2606                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2607                                                                                         _mm_setzero_si128());
2608                                         fracm = _mm_srli_epi16(subtc, 1);
2609                                         pix1 = _mm_add_epi16(pix1,
2610                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2612                                         pix3 = _mm_add_epi16(pix3,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2614                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2615                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2616                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2617                                         pix2 = _mm_add_epi16(pix2,
2618                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2619                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2620                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2621                                 }
2622                                 if (x <= endsub)
2623                                 {
2624                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2625                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2626                                         tci = _mm_madd_epi16(tci, tcoffset);
2627                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2628                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2629                                                                                         _mm_setzero_si128());
2630                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2631                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2632                                                                                         _mm_setzero_si128());
2633                                         fracm = _mm_srli_epi16(subtc, 1);
2634                                         pix1 = _mm_add_epi16(pix1,
2635                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2636                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2637                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2638                                         pix1 = _mm_add_epi16(pix1,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2641                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2642                                         x++;
2643                                 }
2644                         }
2645                         else
2646                         {
2647                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2648                                 {
2649                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2650                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2651                                         tci = _mm_madd_epi16(tci, tcoffset);
2652                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2653                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2654                                                                                         _mm_setzero_si128());
2655                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2656                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2657                                                                                         _mm_setzero_si128());
2658                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2659                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2660                                         tci = _mm_madd_epi16(tci, tcoffset);
2661                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2662                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2663                                                                                         _mm_setzero_si128());
2664                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2665                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2666                                                                                         _mm_setzero_si128());
2667                                         fracm = _mm_srli_epi16(subtc, 1);
2668                                         pix1 = _mm_add_epi16(pix1,
2669                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2671                                         pix3 = _mm_add_epi16(pix3,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2673                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2674                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2675                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2676                                         pix2 = _mm_add_epi16(pix2,
2677                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2678                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2679                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2680                                 }
2681                                 if (x <= endsub)
2682                                 {
2683                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2684                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2685                                         tci = _mm_madd_epi16(tci, tcoffset);
2686                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2687                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2688                                                                                         _mm_setzero_si128());
2689                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2690                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2691                                                                                         _mm_setzero_si128());
2692                                         fracm = _mm_srli_epi16(subtc, 1);
2693                                         pix1 = _mm_add_epi16(pix1,
2694                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2695                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2696                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2697                                         pix1 = _mm_add_epi16(pix1,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2700                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2701                                         x++;
2702                                 }
2703                         }
2704                 }
2705                 else
2706                 {
2707                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2708                         {
2709                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2710                                 {
2711                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2713                                         tci = _mm_madd_epi16(tci, tcoffset);
2714                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2716                                 }
2717                                 if (x <= endsub)
2718                                 {
2719                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721                                         tci = _mm_madd_epi16(tci, tcoffset);
2722                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723                                         x++;
2724                                 }
2725                         }
2726                         else
2727                         {
2728                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2729                                 {
2730                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2731                                         tci = _mm_and_si128(tci, tcmax); 
2732                                         tci = _mm_madd_epi16(tci, tcoffset);
2733                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2734                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2735                                 }
2736                                 if (x <= endsub)
2737                                 {
2738                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2739                                         tci = _mm_and_si128(tci, tcmax); 
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742                                         x++;
2743                                 }
2744                         }
2745                 }
2746         }
2747 #endif
2748 }
2749
2750 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2751 {
2752         // TODO: IMPLEMENT
2753         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2754 }
2755
// Shadowmap lookup placeholder: the input vector is ignored and the
// result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2761
2762 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2763 {
2764         int x;
2765         int startx = span->startx;
2766         int endx = span->endx;
2767         float c[4];
2768         float data[4];
2769         float slope[4];
2770         float z;
2771         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2772         for (x = startx;x < endx;x++)
2773         {
2774                 z = zf[x];
2775                 c[0] = (data[0] + slope[0]*x) * z;
2776                 c[1] = (data[1] + slope[1]*x) * z;
2777                 c[2] = (data[2] + slope[2]*x) * z;
2778                 c[3] = (data[3] + slope[3]*x) * z;
2779                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2780                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2781                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2782                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2783         }
2784 }
2785
2786 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2787 {
2788         int x;
2789         int startx = span->startx;
2790         int endx = span->endx;
2791         float c[4];
2792         float data[4];
2793         float slope[4];
2794         float z;
2795         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796         for (x = startx;x < endx;x++)
2797         {
2798                 z = zf[x];
2799                 c[0] = (data[0] + slope[0]*x) * z;
2800                 c[1] = (data[1] + slope[1]*x) * z;
2801                 c[2] = (data[2] + slope[2]*x) * z;
2802                 c[3] = (data[3] + slope[3]*x) * z;
2803                 out4f[x*4+0] = c[0];
2804                 out4f[x*4+1] = c[1];
2805                 out4f[x*4+2] = c[2];
2806                 out4f[x*4+3] = c[3];
2807         }
2808 }
2809
2810 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2811 {
2812         int x, startx = span->startx, endx = span->endx;
2813         float c[4], localcolor[4];
2814         localcolor[0] = subcolor[0];
2815         localcolor[1] = subcolor[1];
2816         localcolor[2] = subcolor[2];
2817         localcolor[3] = subcolor[3];
2818         for (x = startx;x < endx;x++)
2819         {
2820                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2821                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2822                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2823                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2824                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2825                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2826                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2827                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2828         }
2829 }
2830
2831 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2832 {
2833         int x, startx = span->startx, endx = span->endx;
2834         for (x = startx;x < endx;x++)
2835         {
2836                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2837                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2838                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2839                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2840         }
2841 }
2842
2843 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2844 {
2845         int x, startx = span->startx, endx = span->endx;
2846         for (x = startx;x < endx;x++)
2847         {
2848                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2849                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2850                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2851                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2852         }
2853 }
2854
2855 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2856 {
2857         int x, startx = span->startx, endx = span->endx;
2858         float a, b;
2859         for (x = startx;x < endx;x++)
2860         {
2861                 a = 1.0f - inb4f[x*4+3];
2862                 b = inb4f[x*4+3];
2863                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2864                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2865                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2866                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2867         }
2868 }
2869
2870 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2871 {
2872         int x, startx = span->startx, endx = span->endx;
2873         float localcolor[4], ilerp, lerp;
2874         localcolor[0] = color[0];
2875         localcolor[1] = color[1];
2876         localcolor[2] = color[2];
2877         localcolor[3] = color[3];
2878         ilerp = 1.0f - localcolor[3];
2879         lerp = localcolor[3];
2880         for (x = startx;x < endx;x++)
2881         {
2882                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2883                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2884                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2885                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2886         }
2887 }
2888
2889
2890
2891 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2892 {
2893 #ifdef SSE2_PRESENT
2894         int x;
2895         int startx = span->startx;
2896         int endx = span->endx;
2897         __m128 data, slope;
2898         __m128 mod, endmod;
2899         __m128i submod, substep, endsubmod;
2900         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2901         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2902         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2903         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2904         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2905         for (x = startx; x < endx;)
2906         {
2907                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2908                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2909                 if (nextsub >= endx)
2910                 {
2911                         nextsub = endsub = endx-1;
2912                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2913                 }
2914                 mod = endmod;
2915                 submod = endsubmod;
2916                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2917                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2918                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2919                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2920                 substep = _mm_packs_epi32(substep, substep);
2921                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2922                 {
2923                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2924                         pix = _mm_mulhi_epu16(pix, submod);
2925                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2926                 }
2927                 if (x <= endsub)
2928                 {
2929                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2930                         pix = _mm_mulhi_epu16(pix, submod);
2931                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2932                         x++;
2933                 }
2934         }
2935 #endif
2936 }
2937
2938 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2939 {
2940 #ifdef SSE2_PRESENT
2941         int x;
2942         int startx = span->startx;
2943         int endx = span->endx;
2944         __m128 data, slope;
2945         __m128 mod, endmod;
2946         __m128i submod, substep, endsubmod;
2947         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2948         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2949         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2950         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2951         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2952         for (x = startx; x < endx;)
2953         {
2954                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2955                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2956                 if (nextsub >= endx)
2957                 {
2958                         nextsub = endsub = endx-1;
2959                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2960                 }
2961                 mod = endmod;
2962                 submod = endsubmod;
2963                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2964                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2965                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2966                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2967                 substep = _mm_packs_epi32(substep, substep);
2968                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2969                 {
2970                         __m128i pix = _mm_srai_epi16(submod, 4);
2971                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2972                 }
2973                 if (x <= endsub)
2974                 {
2975                         __m128i pix = _mm_srai_epi16(submod, 4);
2976                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2977                         x++;
2978                 }
2979         }
2980 #endif
2981 }
2982
2983 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2984 {
2985 #ifdef SSE2_PRESENT
2986         int x, startx = span->startx, endx = span->endx;
2987         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2988         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2989         for (x = startx;x+2 <= endx;x+=2)
2990         {
2991                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2992                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2993                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2994                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2995         }
2996         if (x < endx)
2997         {
2998                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2999                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3000                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3001                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3002         }
3003 #endif
3004 }
3005
3006 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3007 {
3008 #ifdef SSE2_PRESENT
3009         int x, startx = span->startx, endx = span->endx;
3010         for (x = startx;x+2 <= endx;x+=2)
3011         {
3012                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3013                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3014                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3016         }
3017         if (x < endx)
3018         {
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3021                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3023         }
3024 #endif
3025 }
3026
3027 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3028 {
3029 #ifdef SSE2_PRESENT
3030         int x, startx = span->startx, endx = span->endx;
3031         for (x = startx;x+2 <= endx;x+=2)
3032         {
3033                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3035                 pix1 = _mm_add_epi16(pix1, pix2);
3036                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3037         }
3038         if (x < endx)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3042                 pix1 = _mm_add_epi16(pix1, pix2);
3043                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3044         }
3045 #endif
3046 }
3047
3048 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3049 {
3050 #ifdef SSE2_PRESENT
3051         int x, startx = span->startx, endx = span->endx;
3052         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3053         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3054         for (x = startx;x+2 <= endx;x+=2)
3055         {
3056                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3057                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3058                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3060         }
3061         if (x < endx)
3062         {
3063                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3065                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3067         }
3068 #endif
3069 }
3070
3071 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3072 {
3073 #ifdef SSE2_PRESENT
3074         int x, startx = span->startx, endx = span->endx;
3075         for (x = startx;x+2 <= endx;x+=2)
3076         {
3077                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3079                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3080                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3081                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3082         }
3083         if (x < endx)
3084         {
3085                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3086                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3087                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3088                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3090         }
3091 #endif
3092 }
3093
3094 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3095 {
3096 #ifdef SSE2_PRESENT
3097         int x, startx = span->startx, endx = span->endx;
3098         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3099         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3100         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3101         for (x = startx;x+2 <= endx;x+=2)
3102         {
3103                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3104                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3105                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3106         }
3107         if (x < endx)
3108         {
3109                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3110                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3111                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3112         }
3113 #endif
3114 }
3115
3116
3117
3118 void DPSOFTRAST_VertexShader_Generic(void)
3119 {
3120         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3121         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3122         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3123         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3124                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3125 }
3126
3127 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3128 {
3129         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3130         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3134         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3135         {
3136                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3137                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3138                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3139                 {
3140                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3141                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3142                         {
3143                                 // multiply
3144                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3145                         }
3146                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3147                         {
3148                                 // add
3149                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3150                         }
3151                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3152                         {
3153                                 // alphablend
3154                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3155                         }
3156                 }
3157         }
3158         else
3159                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3160         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3161 }
3162
3163
3164
3165 void DPSOFTRAST_VertexShader_PostProcess(void)
3166 {
3167         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3168         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3169         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3170 }
3171
3172 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3173 {
3174         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3175         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3176         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3179         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3180         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3181         {
3182                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3183                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3184         }
3185         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3186         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3187         {
3188                 // TODO: implement saturation
3189         }
3190         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3191         {
3192                 // TODO: implement gammaramps
3193         }
3194         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3195 }
3196
3197
3198
3199 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3200 {
3201         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3202 }
3203
3204 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3205 {
3206         // this is never called (because colormask is off when this shader is used)
3207         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3208         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3210         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3211         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3212 }
3213
3214
3215
3216 void DPSOFTRAST_VertexShader_FlatColor(void)
3217 {
3218         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3220 }
3221
3222 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 {
3224 #ifdef SSE2_PRESENT
3225         unsigned char * RESTRICT pixelmask = span->pixelmask;
3226         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3227         int x, startx = span->startx, endx = span->endx;
3228         __m128i Color_Ambientm;
3229         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3230         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3231         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3233         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3234         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3235                 pixel = buffer_FragColorbgra8;
3236         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3237         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3238         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3239         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3240         for (x = startx;x < endx;x++)
3241         {
3242                 __m128i color, pix;
3243                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3244                 {
3245                         __m128i pix2;
3246                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3247                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3248                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3249                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3250                         x += 3;
3251                         continue;
3252                 }
3253                 if (!pixelmask[x])
3254                         continue;
3255                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3256                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3257                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3258         }
3259         if (pixel == buffer_FragColorbgra8)
3260                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3261 #endif
3262 }
3263
3264
3265
3266 void DPSOFTRAST_VertexShader_VertexColor(void)
3267 {
3268         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3269         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3270         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3271 }
3272
3273 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3274 {
3275 #ifdef SSE2_PRESENT
3276         unsigned char * RESTRICT pixelmask = span->pixelmask;
3277         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3278         int x, startx = span->startx, endx = span->endx;
3279         __m128i Color_Ambientm, Color_Diffusem;
3280         __m128 data, slope;
3281         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3282         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3285         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3287         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3288                 pixel = buffer_FragColorbgra8;
3289         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3290         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3291         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3292         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3293         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3294         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3295         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3296         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3297         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3298         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3299         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3300         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3301         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3302         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3303         {
3304                 __m128i color, mod, pix;
3305                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3306                 {
3307                         __m128i pix2, mod2;
3308                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3309                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3310                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3311                         data = _mm_add_ps(data, slope);
3312                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3313                         data = _mm_add_ps(data, slope);
3314                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3315                         data = _mm_add_ps(data, slope);
3316                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3317                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3318                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3319                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3320                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3321                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3322                         x += 3;
3323                         continue;
3324                 }
3325                 if (!pixelmask[x])
3326                         continue;
3327                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3328                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3329                 mod = _mm_packs_epi32(mod, mod);
3330                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3331                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3332         }
3333         if (pixel == buffer_FragColorbgra8)
3334                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3335 #endif
3336 }
3337
3338
3339
3340 void DPSOFTRAST_VertexShader_Lightmap(void)
3341 {
3342         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3343         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3344         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3345 }
3346
3347 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3348 {
3349 #ifdef SSE2_PRESENT
3350         unsigned char * RESTRICT pixelmask = span->pixelmask;
3351         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3352         int x, startx = span->startx, endx = span->endx;
3353         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3354         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3360         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3361         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3362         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3363                 pixel = buffer_FragColorbgra8;
3364         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3365         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3366         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3367         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3368         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3369         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3370         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3371         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3372         {
3373                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3374                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3375                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3376                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3377                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3378                 for (x = startx;x < endx;x++)
3379                 {
3380                         __m128i color, lightmap, glow, pix;
3381                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3382                         {
3383                                 __m128i pix2;
3384                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3385                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3386                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3387                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3388                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3389                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3390                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3391                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3392                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3393                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3394                                 x += 3;
3395                                 continue;
3396                         }
3397                         if (!pixelmask[x])
3398                                 continue;
3399                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3400                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3401                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3402                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3403                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3404                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3405                 }
3406         }
3407         else
3408         {
3409                 for (x = startx;x < endx;x++)
3410                 {
3411                         __m128i color, lightmap, pix;
3412                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3413                         {
3414                                 __m128i pix2;
3415                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3416                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3417                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3418                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3419                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3420                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3421                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3422                                 x += 3;
3423                                 continue;
3424                         }
3425                         if (!pixelmask[x]) 
3426                                 continue;
3427                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3428                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3429                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3430                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3431                 }
3432         }
3433         if (pixel == buffer_FragColorbgra8)
3434                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3435 #endif
3436 }
3437
3438
3439
3440 void DPSOFTRAST_VertexShader_FakeLight(void)
3441 {
3442         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3443 }
3444
3445 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3446 {
3447         // TODO: IMPLEMENT
3448         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3449         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3450         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3451         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3452         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3453 }
3454
3455
3456
// vertex stage for the model space light direction map mode:
// simply runs the Lightmap vertex shader
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3461
3462 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3463 {
3464         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3465         // TODO: IMPLEMENT
3466 }
3467
3468
3469
// vertex stage for the tangent space light direction map mode:
// simply runs the Lightmap vertex shader
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3474
3475 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3476 {
3477         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3478         // TODO: IMPLEMENT
3479 }
3480
3481
3482
3483 void DPSOFTRAST_VertexShader_LightDirection(void)
3484 {
3485         int i;
3486         int numvertices = dpsoftrast.numvertices;
3487         float LightDir[4];
3488         float LightVector[4];
3489         float EyePosition[4];
3490         float EyeVectorModelSpace[4];
3491         float EyeVector[4];
3492         float position[4];
3493         float svector[4];
3494         float tvector[4];
3495         float normal[4];
3496         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3497         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3498         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3499         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3500         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3501         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3502         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3503         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3504         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3505         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3506         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3507         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3508         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3509         for (i = 0;i < numvertices;i++)
3510         {
3511                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3512                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3513                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3514                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3515                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3516                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3517                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3518                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3519                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3520                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3521                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3522                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3523                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3524                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3525                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3526                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3527                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3528                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3529                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3530                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3531                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3532                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3533                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3534                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3535                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3538                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3539                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3540         }
3541         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3542 }
3543
3544 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3545 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3546 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3547 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3548 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3549 #define DPSOFTRAST_Vector3Normalize(v)\
3550 do\
3551 {\
3552         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3553         if (len)\
3554         {\
3555                 len = 1.0f / len;\
3556                 v[0] *= len;\
3557                 v[1] *= len;\
3558                 v[2] *= len;\
3559         }\
3560 }\
3561 while(0)
3562
3563 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3564 {
3565         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3566         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573         int x, startx = span->startx, endx = span->endx;
3574         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3575         float LightVectordata[4];
3576         float LightVectorslope[4];
3577         float EyeVectordata[4];
3578         float EyeVectorslope[4];
3579         float z;
3580         float diffusetex[4];
3581         float glosstex[4];
3582         float surfacenormal[4];
3583         float lightnormal[4];
3584         float eyenormal[4];
3585         float specularnormal[4];
3586         float diffuse;
3587         float specular;
3588         float SpecularPower;
3589         int d[4];
3590         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3591         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3592         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3593         Color_Glow[3] = 0.0f;
3594         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3595         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3596         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3597         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3598         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3599         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3600         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3601         Color_Pants[3] = 0.0f;
3602         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3603         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3604         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3605         Color_Shirt[3] = 0.0f;
3606         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3607         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3608         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3609         {
3610                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3611                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3612         }
3613         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3614         {
3615                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3616         }
3617         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3618         {
3619                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3620                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3621                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3622                 Color_Diffuse[3] = 0.0f;
3623                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3624                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3625                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3626                 LightColor[3] = 0.0f;
3627                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3628                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3629                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3630                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3631                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3632                 Color_Specular[3] = 0.0f;
3633                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3634                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3635                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3636                 for (x = startx;x < endx;x++)
3637                 {
3638                         z = buffer_z[x];
3639                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3640                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3641                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3642                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3643                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3644                         {
3645                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3646                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3647                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3648                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3649                         }
3650                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3651                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3652                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3653                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3654                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3655                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3656                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3657                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3658
3659                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3660                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3661                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3662                         DPSOFTRAST_Vector3Normalize(lightnormal);
3663
3664                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3665                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3666                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3667                         DPSOFTRAST_Vector3Normalize(eyenormal);
3668
3669                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3670                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3671                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3672                         DPSOFTRAST_Vector3Normalize(specularnormal);
3673
3674                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3675                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3676                         specular = pow(specular, SpecularPower * glosstex[3]);
3677                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3678                         {
3679                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3680                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3681                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3682                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3683                         }
3684                         else
3685                         {
3686                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3687                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3688                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3689                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3690                         }
3691                         buffer_FragColorbgra8[x*4+0] = d[0];
3692                         buffer_FragColorbgra8[x*4+1] = d[1];
3693                         buffer_FragColorbgra8[x*4+2] = d[2];
3694                         buffer_FragColorbgra8[x*4+3] = d[3];
3695                 }
3696         }
3697         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3698         {
3699                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3700                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3701                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3702                 Color_Diffuse[3] = 0.0f;
3703                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3704                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3705                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3706                 LightColor[3] = 0.0f;
3707                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3708                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709                 for (x = startx;x < endx;x++)
3710                 {
3711                         z = buffer_z[x];
3712                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3713                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3714                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3715                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3716                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3717                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3718                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3719                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3720
3721                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3722                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3723                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3724                         DPSOFTRAST_Vector3Normalize(lightnormal);
3725
3726                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3727                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3728                         {
3729                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3730                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3731                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3732                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3733                         }
3734                         else
3735                         {
3736                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3737                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3738                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3739                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3740                         }
3741                         buffer_FragColorbgra8[x*4+0] = d[0];
3742                         buffer_FragColorbgra8[x*4+1] = d[1];
3743                         buffer_FragColorbgra8[x*4+2] = d[2];
3744                         buffer_FragColorbgra8[x*4+3] = d[3];
3745                 }
3746         }
3747         else
3748         {
3749                 for (x = startx;x < endx;x++)
3750                 {
3751                         z = buffer_z[x];
3752                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3753                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3754                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3755                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3756
3757                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3758                         {
3759                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3760                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3761                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3762                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3763                         }
3764                         else
3765                         {
3766                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3767                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3768                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3769                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3770                         }
3771                         buffer_FragColorbgra8[x*4+0] = d[0];
3772                         buffer_FragColorbgra8[x*4+1] = d[1];
3773                         buffer_FragColorbgra8[x*4+2] = d[2];
3774                         buffer_FragColorbgra8[x*4+3] = d[3];
3775                 }
3776         }
3777         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3778 }
3779
3780
3781
3782 void DPSOFTRAST_VertexShader_LightSource(void)
3783 {
3784         int i;
3785         int numvertices = dpsoftrast.numvertices;
3786         float LightPosition[4];
3787         float LightVector[4];
3788         float LightVectorModelSpace[4];
3789         float EyePosition[4];
3790         float EyeVectorModelSpace[4];
3791         float EyeVector[4];
3792         float position[4];
3793         float svector[4];
3794         float tvector[4];
3795         float normal[4];
3796         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3797         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3798         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3799         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3800         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3801         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3802         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3803         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3804         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3805         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3806         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3807         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3808         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3809         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3810         for (i = 0;i < numvertices;i++)
3811         {
3812                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3813                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3814                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3815                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3816                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3817                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3818                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3819                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3820                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3821                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3822                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3823                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3824                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3825                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3826                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3827                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3828                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3829                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3830                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3831                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3832                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3833                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3834                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3835                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3836                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3837                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3838                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3839                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3840                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3841                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3842                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3843                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3844         }
3845         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3846         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3847 }
3848
3849 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3850 {
3851 #ifdef SSE2_PRESENT
3852         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3853         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3859         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3860         int x, startx = span->startx, endx = span->endx;
3861         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3862         float CubeVectordata[4];
3863         float CubeVectorslope[4];
3864         float LightVectordata[4];
3865         float LightVectorslope[4];
3866         float EyeVectordata[4];
3867         float EyeVectorslope[4];
3868         float z;
3869         float diffusetex[4];
3870         float glosstex[4];
3871         float surfacenormal[4];
3872         float lightnormal[4];
3873         float eyenormal[4];
3874         float specularnormal[4];
3875         float diffuse;
3876         float specular;
3877         float SpecularPower;
3878         float CubeVector[4];
3879         float attenuation;
3880         int d[4];
3881         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3882         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3883         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3884         Color_Glow[3] = 0.0f;
3885         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3886         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3887         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3888         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3889         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3890         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3891         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3892         Color_Diffuse[3] = 0.0f;
3893         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3894         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3895         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3896         Color_Specular[3] = 0.0f;
3897         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3898         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3899         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3900         Color_Pants[3] = 0.0f;
3901         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3902         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3903         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3904         Color_Shirt[3] = 0.0f;
3905         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3906         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3907         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3908         LightColor[3] = 0.0f;
3909         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3910         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3911         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3912         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3913         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3914         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3915         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3916         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3917         {
3918                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3919                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3920         }
3921         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3922                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3923         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3924         {
3925                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927                 for (x = startx;x < endx;x++)
3928                 {
3929                         z = buffer_z[x];
3930                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3931                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3932                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3933                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3934                         if (attenuation < 0.01f)
3935                                 continue;
3936                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3937                         {
3938                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3939                                 if (attenuation < 0.01f)
3940                                         continue;
3941                         }
3942
3943                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3944                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3945                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3946                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3947                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3948                         {
3949                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3950                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3951                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3952                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3953                         }
3954                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3955                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3956                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3957                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3958                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3959                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3960                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3961                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3962
3963                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3964                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3965                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3966                         DPSOFTRAST_Vector3Normalize(lightnormal);
3967
3968                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3969                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3970                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3971                         DPSOFTRAST_Vector3Normalize(eyenormal);
3972
3973                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3974                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3975                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3976                         DPSOFTRAST_Vector3Normalize(specularnormal);
3977
3978                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3979                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3980                         specular = pow(specular, SpecularPower * glosstex[3]);
3981                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3982                         {
3983                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3984                                 attenuation *= (1.0f / 255.0f);
3985                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3986                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3987                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3988                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3989                         }
3990                         else
3991                         {
3992                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3993                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3994                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3995                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3996                         }
3997                         buffer_FragColorbgra8[x*4+0] = d[0];
3998                         buffer_FragColorbgra8[x*4+1] = d[1];
3999                         buffer_FragColorbgra8[x*4+2] = d[2];
4000                         buffer_FragColorbgra8[x*4+3] = d[3];
4001                 }
4002         }
4003         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4004         {
4005                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4006                 for (x = startx;x < endx;x++)
4007                 {
4008                         z = buffer_z[x];
4009                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4010                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4011                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4012                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4013                         if (attenuation < 0.01f)
4014                                 continue;
4015                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4016                         {
4017                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4018                                 if (attenuation < 0.01f)
4019                                         continue;
4020                         }
4021
4022                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4023                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4024                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4025                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4026                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4027                         {
4028                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4029                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4030                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4031                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4032                         }
4033                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4034                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4035                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4036                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4037
4038                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4039                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4040                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4041                         DPSOFTRAST_Vector3Normalize(lightnormal);
4042
4043                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4044                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4045                         {
4046                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4047                                 attenuation *= (1.0f / 255.0f);
4048                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4049                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4050                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4051                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4052                         }
4053                         else
4054                         {
4055                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4056                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4057                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4058                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4059                         }
4060                         buffer_FragColorbgra8[x*4+0] = d[0];
4061                         buffer_FragColorbgra8[x*4+1] = d[1];
4062                         buffer_FragColorbgra8[x*4+2] = d[2];
4063                         buffer_FragColorbgra8[x*4+3] = d[3];
4064                 }
4065         }
4066         else
4067         {
4068                 for (x = startx;x < endx;x++)
4069                 {
4070                         z = buffer_z[x];
4071                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4072                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4073                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4074                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4075                         if (attenuation < 0.01f)
4076                                 continue;
4077                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4078                         {
4079                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4080                                 if (attenuation < 0.01f)
4081                                         continue;
4082                         }
4083
4084                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4085                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4086                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4087                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4088                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4089                         {
4090                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4091                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4092                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4093                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4094                         }
4095                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4096                         {
4097                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4098                                 attenuation *= (1.0f / 255.0f);
4099                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4100                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4101                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4102                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4103                         }
4104                         else
4105                         {
4106                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4107                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4108                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4109                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4110                         }
4111                         buffer_FragColorbgra8[x*4+0] = d[0];
4112                         buffer_FragColorbgra8[x*4+1] = d[1];
4113                         buffer_FragColorbgra8[x*4+2] = d[2];
4114                         buffer_FragColorbgra8[x*4+3] = d[3];
4115                 }
4116         }
4117         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4118 #endif
4119 }
4120
4121
4122
4123 void DPSOFTRAST_VertexShader_Refraction(void)
4124 {
4125         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4126 }
4127
4128 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4129 {
4130         // TODO: IMPLEMENT
4131         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4132         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4133         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4134         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4135         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4136 }
4137
4138
4139
4140 void DPSOFTRAST_VertexShader_Water(void)
4141 {
4142         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4143 }
4144
4145
4146 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4147 {
4148         // TODO: IMPLEMENT
4149         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4150         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4152         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4153         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4154 }
4155
4156
4157
4158 void DPSOFTRAST_VertexShader_ShowDepth(void)
4159 {
4160         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4161 }
4162
4163 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4164 {
4165         // TODO: IMPLEMENT
4166         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4167         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4168         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4169         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4170         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4171 }
4172
4173
4174
4175 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4176 {
4177         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4178 }
4179
4180 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4181 {
4182         // TODO: IMPLEMENT
4183         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4184         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4185         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4186         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4187         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4188 }
4189
4190
4191
4192 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4193 {
4194         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4195 }
4196
4197 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4198 {
4199         // TODO: IMPLEMENT
4200         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4201         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4202         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4203         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4204         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4205 }
4206
4207
4208
4209 typedef struct DPSOFTRAST_ShaderModeInfo_s
4210 {
4211         int lodarrayindex;
4212         void (*Vertex)(void);
4213         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4214         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4215         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4216 }
4217 DPSOFTRAST_ShaderModeInfo;
4218
4219 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4220 {
4221         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4222         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4223         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4224         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4225         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4226         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4227         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4228         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4229         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4230         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4231         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4232         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4233         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4234         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4235         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4236         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4237 };
4238
4239 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4240 {
4241         int i;
4242         int x;
4243         int startx;
4244         int endx;
4245 //      unsigned int c;
4246 //      unsigned int *colorpixel;
4247         unsigned int *depthpixel;
4248         float w;
4249         float wslope;
4250         int depth;
4251         int depthslope;
4252         unsigned int d;
4253         DPSOFTRAST_State_Triangle *triangle;
4254         DPSOFTRAST_State_Span *span;
4255         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4256         for (i = 0; i < thread->numspans; i++)
4257         {
4258                 span = &thread->spans[i];
4259                 triangle = &thread->triangles[span->triangle];
4260                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4261                 {
4262                         wslope = triangle->w[0];
4263                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4264                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4265                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4266                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4267                         startx = span->startx;
4268                         endx = span->endx;
4269                         switch(thread->fb_depthfunc)
4270                         {
4271                         default:
4272                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4273                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4274                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4275                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4276                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4277                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4278                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4279                         }
4280                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4281                         //for (x = startx;x < endx;x++)
4282                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4283                         // if there is no color buffer, skip pixel shader
4284                         while (startx < endx && !pixelmask[startx])
4285                                 startx++;
4286                         while (endx > startx && !pixelmask[endx-1])
4287                                 endx--;
4288                         if (startx >= endx)
4289                                 continue; // no pixels to fill
4290                         span->pixelmask = pixelmask;
4291                         span->startx = startx;
4292                         span->endx = endx;
4293                         // run pixel shader if appropriate
4294                         // do this before running depthmask code, to allow the pixelshader
4295                         // to clear pixelmask values for alpha testing
4296                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4297                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4298                         if (thread->depthmask)
4299                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4300                                         if (pixelmask[x])
4301                                                 depthpixel[x] = d;
4302                 }
4303                 else
4304                 {
4305                         // no depth testing means we're just dealing with color...
4306                         // if there is no color buffer, skip pixel shader
4307                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4308                         {
4309                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4310                                 span->pixelmask = pixelmask;
4311                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4312                         }
4313                 }
4314         }
4315         thread->numspans = 0;
4316 }
4317
// Draw command (id 22) and its payload fields.
// NOTE(review): DEFCOMMAND is defined outside this view; presumably it declares
// the DPSOFTRAST_Command_Draw struct consumed by DPSOFTRAST_Interpret_Draw.
// As used by the interpreter below: starty/endy are compared against each
// thread's row bands, refcount is decremented once per interpreting thread,
// clipped selects whether the near-plane test is run, firstvertex is subtracted
// from every index, arrays holds the packed per-vertex float4 data, and
// element3s / element3i are the optional 16-bit / 32-bit index lists.
4318 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4319
// Interprets one queued Draw command on the calling thread.
//
// For every triangle of the command this: fetches the vertex indices, clips the
// triangle against the near plane (producing 3 or 4 screen-space points), culls
// it by facing, clamps its bounding box to the scissor/band limits, sets up the
// interpolation planes for w and for each vertex array used by the current
// shader mode, picks a mip level per texture unit, and queues spans for the
// scanlines that fall inside this thread's two row bands (miny1..maxy1 and
// miny2..maxy2).  Queued spans are flushed with DPSOFTRAST_Draw_ProcessSpans
// whenever the span or triangle queue fills up, and once more at the end.
//
// thread:  state of the rasterizer thread executing the command.
// command: the Draw command; its refcount is decremented exactly once on every
//          path through the SSE2 body, and the thread that brings it to zero
//          frees command->arrays when commandsize is no larger than the bare
//          command struct (NOTE(review): presumably that means the vertex data
//          was allocated separately instead of inline - confirm against the
//          allocation code).
//
// The whole body is compiled only when SSE2_PRESENT is defined; without it the
// function does nothing.
4320 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4321 {
4322 #ifdef SSE2_PRESENT
4323         int cullface = thread->cullface;
4324         int minx, maxx, miny, maxy;
4325         int miny1, maxy1, miny2, maxy2;
4326         __m128i fbmin, fbmax;
4327         __m128 viewportcenter, viewportscale;
4328         int firstvertex = command->firstvertex;
4329         int numvertices = command->numvertices;
4330         int numtriangles = command->numtriangles;
4331         const int *element3i = command->element3i;
4332         const unsigned short *element3s = command->element3s;
4333         int clipped = command->clipped;
4334         int i;
4335         int j;
4336         int k;
4337         int y;
4338         int e[3];
4339         __m128i screeny;
4340         int starty, endy, bandy;
4341         int numpoints;
4342         int clipcase;
4343         float clipdist[4];
4344         __m128 triangleedge1, triangleedge2, trianglenormal;
4345         __m128 clipfrac[3];
4346         __m128 screen[4];
4347         DPSOFTRAST_State_Triangle *triangle;
4348         DPSOFTRAST_Texture *texture;
4349         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // vertical scissor range, and this thread's two row bands limited to it
             // (bound() is a project macro - presumably a clamp of the middle argument)
4350         miny = thread->fb_scissor[1];
4351         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4352         miny1 = bound(miny, thread->miny1, maxy);
4353         maxy1 = bound(miny, thread->maxy1, maxy);
4354         miny2 = bound(miny, thread->miny2, maxy);
4355         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's row range misses both bands: nothing to draw here,
             // but this thread's reference on the command must still be dropped
4356         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4357         {
4358                 if (!ATOMIC_DECREMENT(command->refcount))
4359                 {
4360                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4361                                 MM_FREE(command->arrays);
4362                 }
4363                 return;
4364         }
4365         minx = thread->fb_scissor[0];
4366         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // packed 16-bit (x, y) limits repeated four times, used to clamp each
             // triangle's bounding box; fbmax is inclusive, hence the subtraction of 1
4367         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4368         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4369         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4370         viewportscale = _mm_load_ps(thread->fb_viewportscale);
             // screen[3] and clipfrac[] are only assigned for near-clipped triangles; start them at zero
4371         screen[3] = _mm_setzero_ps();
4372         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4373         for (i = 0;i < numtriangles;i++)
4374         {
                     // command->arrays starts with numvertices float4 screen coordinates; the float4
                     // array that follows is the one the near-plane test below reads (components 2 and 3)
4375                 const float *screencoord4f = command->arrays;
4376                 const float *arrays = screencoord4f + numvertices*4;
4377
4378                 // fetch the three vertex indices of this triangle, rebased by firstvertex
4379                 // (without an index list the vertices are taken sequentially, three per triangle)
4380                 if (element3s)
4381                 {
4382                         e[0] = element3s[i*3+0] - firstvertex;
4383                         e[1] = element3s[i*3+1] - firstvertex;
4384                         e[2] = element3s[i*3+2] - firstvertex;
4385                 }
4386                 else if (element3i)
4387                 {
4388                         e[0] = element3i[i*3+0] - firstvertex;
4389                         e[1] = element3i[i*3+1] - firstvertex;
4390                         e[2] = element3i[i*3+2] - firstvertex;
4391                 }
4392                 else
4393                 {
4394                         e[0] = i*3+0;
4395                         e[1] = i*3+1;
4396                         e[2] = i*3+2;
4397                 }
4398
                     // SKIPBACKFACE: computes the two edge vectors from screen[1] and, in the low lane of
                     // trianglenormal, edge1.x*edge2.y - edge1.y*edge2.x; then skips the triangle
                     // (continue of the triangle loop) when that value is negative for GL_BACK or
                     // positive for GL_FRONT.  The edges and normal are reused by the setup code below.
4399 #define SKIPBACKFACE \
4400                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4401                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4402                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4403                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4404                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4405                 switch(cullface) \
4406                 { \
4407                 case GL_BACK: \
4408                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4409                                 continue; \
4410                         break; \
4411                 case GL_FRONT: \
4412                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4413                                 continue; \
4414                         break; \
4415                 }
4416
                     // CLIPPEDVERTEXLERP: stores in clipfrac[p1] the fraction along p1->p2 at which
                     // clipdist reaches zero, interpolates the two vertices at that fraction and
                     // projects the result into screen[k]
4417 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4418                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4419                         { \
4420                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4421                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4422                         }
                     // CLIPPEDVERTEXCOPY: unclipped vertex, take its stored screen coordinate as is
4423 #define CLIPPEDVERTEXCOPY(k,p1) \
4424                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4425
                     // GENATTRIBCOPY / GENATTRIBLERP: read vertex p1 of the array `arrays` currently
                     // points at, either directly or interpolated towards p2 by the clipfrac[p1]
                     // recorded by CLIPPEDVERTEXLERP
4426 #define GENATTRIBCOPY(attrib, p1) \
4427                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4428 #define GENATTRIBLERP(attrib, p1, p2) \
4429                 { \
4430                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4431                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4432                 }
                     // GENATTRIBS: produces the attribute values for output points 0..2, mirroring
                     // the copy/lerp choice made for screen[0..2] in the matching clipcase below
4433 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4434                 switch(clipcase) \
4435                 { \
4436                 default: \
4437                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4438                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4439                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4440                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4441                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4442                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4443                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4444                 }
4445
                     // commands flagged as not clipped skip the near-plane classification and
                     // jump straight into the "all three vertices in front" case
4446                 if (! clipped)
4447                         goto notclipped;
4448
4449                 // calculate distance from nearplane
                     // (component 2 plus component 3 of each vertex; >= 0 counts as in front)
4450                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4451                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4452                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4453                 if (clipdist[0] >= 0.0f)
4454                 {
4455                         if (clipdist[1] >= 0.0f)
4456                         {
4457                                 if (clipdist[2] >= 0.0f)
4458                                 {
4459                                 notclipped:
4460                                         // triangle is entirely in front of nearplane
4461                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4462                                         SKIPBACKFACE;
4463                                         numpoints = 3;
4464                                         clipcase = 0;
4465                                 }
4466                                 else
4467                                 {
                                             // only vertex 2 is behind: the result is a quad
4468                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4469                                         SKIPBACKFACE;
4470                                         numpoints = 4;
4471                                         clipcase = 1;
4472                                 }
4473                         }
4474                         else
4475                         {
4476                                 if (clipdist[2] >= 0.0f)
4477                                 {
                                             // only vertex 1 is behind: the result is a quad
4478                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4479                                         SKIPBACKFACE;
4480                                         numpoints = 4;
4481                                         clipcase = 2;
4482                                 }
4483                                 else
4484                                 {
                                             // only vertex 0 is in front: the result is a smaller triangle
4485                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4486                                         SKIPBACKFACE;
4487                                         numpoints = 3;
4488                                         clipcase = 3;
4489                                 }
4490                         }
4491                 }
4492                 else if (clipdist[1] >= 0.0f)
4493                 {
4494                         if (clipdist[2] >= 0.0f)
4495                         {
                                     // only vertex 0 is behind: the result is a quad
4496                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4497                                 SKIPBACKFACE;
4498                                 numpoints = 4;
4499                                 clipcase = 4;
4500                         }
4501                         else
4502                         {
                                     // only vertex 1 is in front: the result is a smaller triangle
4503                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4504                                 SKIPBACKFACE;
4505                                 numpoints = 3;
4506                                 clipcase = 5;
4507                         }
4508                 }
4509                 else if (clipdist[2] >= 0.0f)
4510                 {
                             // only vertex 2 is in front: the result is a smaller triangle
4511                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4512                         SKIPBACKFACE;
4513                         numpoints = 3;
4514                         clipcase = 6;
4515                 }
4516                 else continue; // triangle is entirely behind nearplane
4517
4518                 {
4519                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of points 0..3 as signed 16-bit pairs
                             // (point 2 is repeated in the last slot when there are only 3 points);
                             // the shuffles fold the four pairs down to one min pair and one max pair
4520                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4521                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4522                                         screenmin = _mm_min_epi16(screeni, screenir),
4523                                         screenmax = _mm_max_epi16(screeni, screenir);
4524                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4525                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
                             // clamp the bounding box to the scissor x range and to rows miny1..maxy2-1
4526                         screenmin = _mm_max_epi16(screenmin, fbmin);
4527                         screenmax = _mm_min_epi16(screenmax, fbmax);
4528                         // skip offscreen triangles
4529                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4530                                 continue;
4531                         starty = _mm_extract_epi16(screenmin, 1);
4532                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // the clamped row range lies entirely in the gap between the two bands
4533                         if (starty >= maxy1 && endy <= miny2)
4534                                 continue;
                             // arithmetic shift drops the x half of each pair: one 32-bit integer y per point
4535                         screeny = _mm_srai_epi32(screeni, 16);
4536                 }
4537
                     // next free slot of the thread's triangle queue (numtriangles is only advanced
                     // after the spans for this triangle have been queued)
4538                 triangle = &thread->triangles[thread->numtriangles];
4539
4540                 // calculate attribute plans for triangle data...
4541                 // okay, this triangle is going to produce spans, we'd better project
4542                 // the interpolants now (this is what gives perspective texturing),
4543                 // this consists of simply multiplying all arrays by the W coord
4544                 // (which is basically 1/Z), which will be undone per-pixel
4545                 // (multiplying by Z again) to get the perspective-correct array
4546                 // values
4547                 {
4548                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4549                         __m128 mipedgescale, mipdensity;
                             // the x/y components of both edges divided by the scalar computed in SKIPBACKFACE,
                             // then each of the four quotients broadcast to all lanes
4550                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4551                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4552                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4553                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4554                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                             // component 3 (w) of the three screen points, broadcast
4555                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4556                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4557                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself, relative to point 1: slope in x, slope in y, value at the origin
4558                         attribedge1 = _mm_sub_ss(w0, w1);
4559                         attribedge2 = _mm_sub_ss(w2, w1);
4560                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4561                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4562                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4563                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4564                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
                             // triangle->w[] = { x slope, y slope, origin }; DPSOFTRAST_Draw_ProcessSpans
                             // evaluates this plane to obtain per-pixel depth
4565                         _mm_store_ss(&triangle->w[0], attribxslope);
4566                         _mm_store_ss(&triangle->w[1], attribyslope);
4567                         _mm_store_ss(&triangle->w[2], attriborigin);
4568                         mipedgescale = _mm_setzero_ps();
                             // one plane per vertex array listed for the current shader mode; the list is
                             // terminated by an entry >= DPSOFTRAST_ARRAY_TOTAL
4569                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4570                         {
4571                                 __m128 attrib0, attrib1, attrib2;
4572                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4573                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4574                                         break;
                                     // step to the next packed per-vertex float4 array in the command data
4575                                 arrays += numvertices*4;
4576                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                     // same plane setup as for w above, applied to attribute*w for all four components
4577                                 attriborigin = _mm_mul_ps(attrib1, w1);
4578                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4579                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4580                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4581                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4582                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
                                     // attribs[k] = { x slope, y slope, origin }, written with non-temporal stores
4583                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4584                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4585                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array the shader mode names as its LOD source: change of the first two
                                     // attribute components along each edge divided by that edge's screen length
                                     // (approximate reciprocal square root)
4586                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4587                                 {
4588                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4589                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4590                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4591                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4592                                 }
4593                         }
4594
                             // choose a mip level for each texture unit the shader mode uses; level 0 unless
                             // computed otherwise below
4595                         memset(triangle->mip, 0, sizeof(triangle->mip));
4596                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4597                         {
4598                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4599                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4600                                         break;
4601                                 texture = thread->texbound[texunit];
                                     // NOTE(review): filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are presumably
                                     // the mipmapped modes, and mipmap[0][2..3] presumably the level-0 width/height - confirm
4602                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4603                                 {
4604                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4605                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4606                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
                                             // keep the smaller of the two per-edge squared densities
4607                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4608                                         // this will be multiplied in the texturing routine by the texture resolution
4609                                         y = _mm_cvtss_si32(mipdensity);
4610                                         if (y > 0)
4611                                         {
                                                     // half of log2 of the squared density, limited to the last available mip level
4612                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4613                                                 if (y > texture->mipmaps - 1)
4614                                                         y = texture->mipmaps - 1;
4615                                                 triangle->mip[texunit] = y;
4616                                         }
4617                                 }
4618                         }
4619                 }
4620         
                     // scan conversion: the outer loop first limits the rows to band 1 (bandy = maxy1),
                     // then moves y up to miny2 and continues with band 2; the inner loop walks the
                     // polygon in runs of rows that share the same pair of edges
4621                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4622                 for (; y < bandy;)
4623                 {
4624                         __m128 xcoords, xslope;
                             // ycc lane i is all ones when y is greater than point i's integer y; the byte
                             // mask therefore carries one hex digit (F or 0) per point, lowest digit = point 0
4625                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4626                         int yccmask = _mm_movemask_epi8(ycc);
                             // each edge runs from a point already passed (p) to one not yet passed (n);
                             // the bit pattern in each case comment marks the not-yet-passed points with 1
4627                         int edge0p, edge0n, edge1p, edge1n;
4628                         int nexty;
4629                         if (numpoints == 4)
4630                         {
4631                                 switch(yccmask)
4632                                 {
4633                                 default:
                                     // every point has been passed: the polygon is finished
4634                                 case 0xFFFF: /*0000*/ y = endy; continue;
4635                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4636                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4637                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4638                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4639                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4640                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4641                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4642                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4643                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4644                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4645                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4646                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4647                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4648                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
                                     // no point has been passed yet: try the next row
4649                                 case 0x0000: /*1111*/ y++; continue;
4650                                 }
4651                         }
4652                         else
4653                         {
                                     // 3 points: the fourth slot duplicates point 2, so the two top digits always agree
4654                                 switch(yccmask)
4655                                 {
4656                                 default:
4657                                 case 0xFFFF: /*000*/ y = endy; continue;
4658                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4659                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4660                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4661                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4662                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4663                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4664                                 case 0x0000: /*111*/ y++; continue;
4665                                 }
4666                         }
                             // nexty = lowest integer y among the points not yet passed: passed lanes are
                             // forced to 0x7FFF so they lose the minimum; the run is also limited to the band
4667                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4668                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4669                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4670                         nexty = _mm_extract_epi16(ycc, 0);
4671                         if (nexty >= bandy) nexty = bandy-1;
                             // lanes 0 and 2: dx/dy of edge 0 and edge 1 (lanes 1 and 3 become dy/dy);
                             // xcoords evaluates both edges at row y and adds 0.5 before the integer conversion
4672                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4673                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4674                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4675                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4676                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // make sure the edge with the smaller x at this row sits in the low half
4677                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4678                         {
4679                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4680                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4681                         }
                             // emit one row per iteration, stepping both edge x values by their slopes
4682                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4683                         {
4684                                 int startx, endx, offset;
4685                                 startx = _mm_cvtss_si32(xcoords);
4686                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // left of the scissor: move startx towards minx in whole span-length steps;
                                     // the remainder is trimmed per span through span->startx below
4687                                 if (startx < minx) 
4688                                 {
4689                                         if (startx < 0) startx = 0;
4690                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4691                                 }
4692                                 if (endx > maxx) endx = maxx;
4693                                 if (startx >= endx) continue;
                                     // cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
                                     // span->startx / span->endx are relative to span->x
4694                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4695                                 {
4696                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4697                                         span->triangle = thread->numtriangles;
4698                                         span->x = offset;
4699                                         span->y = y;
4700                                         span->startx = max(minx - offset, 0);
4701                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4702                                         if (span->startx >= span->endx)
4703                                                 continue; 
                                             // span queue full: flush it (this resets thread->numspans to 0)
4704                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4705                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4706                                 }
4707                         }
4708                 }
4709
                     // triangle queue full: flush the pending spans before its slots are reused
4710                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4711                 {
4712                         DPSOFTRAST_Draw_ProcessSpans(thread);
4713                         thread->numtriangles = 0;
4714                 }
4715         }
4716
             // done reading the command data: drop this thread's reference; the thread that
             // brings the count to zero frees the vertex data (same size condition as above)
4717         if (!ATOMIC_DECREMENT(command->refcount))
4718         {
4719                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4720                         MM_FREE(command->arrays);
4721         }
4722
             // flush whatever is still queued so nothing carries over to the next command
4723         if (thread->numspans > 0 || thread->numtriangles > 0)
4724         {
4725                 DPSOFTRAST_Draw_ProcessSpans(thread);
4726                 thread->numtriangles = 0;
4727         }
4728 #endif
4729 }
4730
4731 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4732 {
4733         int i;
4734         int j;
4735         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4736         int datasize = 2*numvertices*sizeof(float[4]);
4737         DPSOFTRAST_Command_Draw *command;
4738         unsigned char *data;
4739         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4740         {
4741                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4742                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4743                         break;
4744                 datasize += numvertices*sizeof(float[4]);
4745         }
4746         if (element3s)
4747                 datasize += numtriangles*sizeof(unsigned short[3]);
4748         else if (element3i)
4749                 datasize += numtriangles*sizeof(int[3]);
4750         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4751         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4752         {
4753                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4754                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4755         }
4756         else
4757         {
4758                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4759                 data = (unsigned char *)command + commandsize;
4760         }
4761         command->firstvertex = firstvertex;
4762         command->numvertices = numvertices;
4763         command->numtriangles = numtriangles;
4764         command->arrays = (float *)data;
4765         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4766         dpsoftrast.firstvertex = firstvertex;
4767         dpsoftrast.numvertices = numvertices;
4768         dpsoftrast.screencoord4f = (float *)data;
4769         data += numvertices*sizeof(float[4]);
4770         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4771         data += numvertices*sizeof(float[4]);
4772         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4773         {
4774                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4775                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4776                         break;
4777                 dpsoftrast.post_array4f[j] = (float *)data;
4778                 data += numvertices*sizeof(float[4]);
4779         }
4780         command->element3i = NULL;
4781         command->element3s = NULL;
4782         if (element3s)
4783         {
4784                 command->element3s = (unsigned short *)data;
4785                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4786         }
4787         else if (element3i)
4788         {
4789                 command->element3i = (int *)data;
4790                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4791         }
4792         return command;
4793 }
4794
4795 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4796 {
4797         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4798         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4799         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4800         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4801         if (command->starty >= command->endy)
4802         {
4803                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4804                         MM_FREE(command->arrays);
4805                 DPSOFTRAST_UndoCommand(command->commandsize);
4806                 return;
4807         }
4808         command->clipped = dpsoftrast.drawclipped;
4809         command->refcount = dpsoftrast.numthreads;
4810
4811         if (dpsoftrast.usethreads)
4812         {
4813                 int i;
4814                 DPSOFTRAST_Draw_SyncCommands();
4815                 for (i = 0; i < dpsoftrast.numthreads; i++)
4816                 {
4817                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4818                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4819                                 Thread_CondSignal(thread->drawcond);
4820                 }
4821         }
4822         else
4823         {
4824                 DPSOFTRAST_Draw_FlushThreads();
4825         }
4826 }
4827  
/*
 * Execute queued commands on behalf of one thread.
 *
 * Starts at thread->commandoffset and dispatches each command in
 * dpsoftrast.commandpool.commands by opcode until the cursor equals
 * endoffset, then stores the final cursor back into thread->commandoffset.
 * The cursor wraps to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
 */
4828 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4829 {
4830         int commandoffset = thread->commandoffset;
4831         while (commandoffset != endoffset)
4832         {
4833                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4834                 switch (command->opcode)
4835                 {
                     // Fixed-size commands: call DPSOFTRAST_Interpret_<name> and advance
                     // by the aligned size of DPSOFTRAST_Command_<name>.
                     // The macro is not #undef'd after use.
4836 #define INTERPCOMMAND(name) \
4837                 case DPSOFTRAST_OPCODE_##name : \
4838                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4839                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4840                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4841                                 commandoffset = 0; \
4842                         break;
4843                 INTERPCOMMAND(Viewport)
4844                 INTERPCOMMAND(ClearColor)
4845                 INTERPCOMMAND(ClearDepth)
4846                 INTERPCOMMAND(ColorMask)
4847                 INTERPCOMMAND(DepthTest)
4848                 INTERPCOMMAND(ScissorTest)
4849                 INTERPCOMMAND(Scissor)
4850                 INTERPCOMMAND(BlendFunc)
4851                 INTERPCOMMAND(BlendSubtract)
4852                 INTERPCOMMAND(DepthMask)
4853                 INTERPCOMMAND(DepthFunc)
4854                 INTERPCOMMAND(DepthRange)
4855                 INTERPCOMMAND(PolygonOffset)
4856                 INTERPCOMMAND(CullFace)
4857                 INTERPCOMMAND(AlphaTest)
4858                 INTERPCOMMAND(AlphaFunc)
4859                 INTERPCOMMAND(SetTexture)
4860                 INTERPCOMMAND(SetShader)
4861                 INTERPCOMMAND(Uniform4f)
4862                 INTERPCOMMAND(UniformMatrix4f)
4863                 INTERPCOMMAND(Uniform1i)
4864
                     // Draw commands are variable-size, so the cursor advances by the
                     // size recorded in the command itself. The thread's offset is also
                     // published after every draw rather than only at the end --
                     // presumably so code comparing thread->commandoffset against
                     // dpsoftrast.drawcommand sees progress; confirm against callers.
4865                 case DPSOFTRAST_OPCODE_Draw:
4866                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4867                         commandoffset += command->commandsize;
4868                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4869                                 commandoffset = 0;
4870                         thread->commandoffset = commandoffset;
4871                         break;
4872
                     // Reset rewinds the cursor to the start of the pool.
4873                 case DPSOFTRAST_OPCODE_Reset:
4874                         commandoffset = 0;
4875                         break;
                     // NOTE(review): there is no default case; an opcode not listed
                     // above leaves commandoffset unchanged, so this loop would
                     // revisit the same command indefinitely.
4876                 }
4877         }
4878         thread->commandoffset = commandoffset;
4879 }
4880
4881 static int DPSOFTRAST_Draw_Thread(void *data)
4882 {
4883         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4884         while(thread->index >= 0)
4885         {
4886                 if (thread->commandoffset != dpsoftrast.drawcommand)
4887                 {
4888                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4889                 }
4890                 else 
4891                 {
4892                         Thread_LockMutex(thread->drawmutex);
4893                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4894                         {
4895                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4896                                 thread->starving = true;
4897                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
4898                                 thread->starving = false;
4899                         }
4900                         Thread_UnlockMutex(thread->drawmutex);
4901                 }
4902         }   
4903         return 0;
4904 }
4905
4906 static void DPSOFTRAST_Draw_FlushThreads(void)
4907 {
4908         DPSOFTRAST_State_Thread *thread;
4909         int i;
4910         DPSOFTRAST_Draw_SyncCommands();
4911         if (dpsoftrast.usethreads) 
4912         {
4913                 for (i = 0; i < dpsoftrast.numthreads; i++)
4914                 {
4915                         thread = &dpsoftrast.threads[i];
4916                         if (thread->commandoffset != dpsoftrast.drawcommand)
4917                         {
4918                                 Thread_LockMutex(thread->drawmutex);
4919                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4920                                         Thread_CondSignal(thread->drawcond);
4921                                 Thread_UnlockMutex(thread->drawmutex);
4922                         }
4923                 }
4924                 for (i = 0; i < dpsoftrast.numthreads; i++)
4925                 {
4926                         thread = &dpsoftrast.threads[i];
4927                         if (thread->commandoffset != dpsoftrast.drawcommand)
4928                         {
4929                                 Thread_LockMutex(thread->drawmutex);
4930                                 if (thread->commandoffset != dpsoftrast.drawcommand)
4931                                 {
4932                                         thread->waiting = true;
4933                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
4934                                         thread->waiting = false;
4935                                 }
4936                                 Thread_UnlockMutex(thread->drawmutex);
4937                         }
4938                 }
4939         }
4940         else
4941         {
4942                 for (i = 0; i < dpsoftrast.numthreads; i++)
4943                 {
4944                         thread = &dpsoftrast.threads[i];
4945                         if (thread->commandoffset != dpsoftrast.drawcommand)
4946                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4947                 }
4948         }
4949         dpsoftrast.commandpool.usedcommands = 0;
4950 }
4951
/*
 * Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(),
 * which brings every thread up to date with the queued commands.
 */
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4956
/*
 * Public finish entry point. Performs the same work as DPSOFTRAST_Flush():
 * all queued commands are flushed through DPSOFTRAST_Draw_FlushThreads().
 */
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4961
4962 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4963 {
4964         int i;
4965         union
4966         {
4967                 int i;
4968                 unsigned char b[4];
4969         }
4970         u;
4971         u.i = 1;
4972         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4973         dpsoftrast.bigendian = u.b[3];
4974         dpsoftrast.fb_width = width;
4975         dpsoftrast.fb_height = height;
4976         dpsoftrast.fb_depthpixels = depthpixels;
4977         dpsoftrast.fb_colorpixels[0] = colorpixels;
4978         dpsoftrast.fb_colorpixels[1] = NULL;
4979         dpsoftrast.fb_colorpixels[1] = NULL;
4980         dpsoftrast.fb_colorpixels[1] = NULL;
4981         dpsoftrast.viewport[0] = 0;
4982         dpsoftrast.viewport[1] = 0;
4983         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4984         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4985         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4986         dpsoftrast.texture_firstfree = 1;
4987         dpsoftrast.texture_end = 1;
4988         dpsoftrast.texture_max = 0;
4989         dpsoftrast.color[0] = 1;
4990         dpsoftrast.color[1] = 1;
4991         dpsoftrast.color[2] = 1;
4992         dpsoftrast.color[3] = 1;
4993         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
4994         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
4995         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
4996         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4997         for (i = 0; i < dpsoftrast.numthreads; i++)
4998         {
4999                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5000                 thread->index = i;
5001                 thread->cullface = GL_BACK;
5002                 thread->colormask[1] = 1;
5003                 thread->colormask[2] = 1;
5004                 thread->colormask[3] = 1;
5005                 thread->blendfunc[0] = GL_ONE;
5006                 thread->blendfunc[1] = GL_ZERO;
5007                 thread->depthmask = true;
5008                 thread->depthtest = true;
5009                 thread->depthfunc = GL_LEQUAL;
5010                 thread->scissortest = false;
5011                 thread->alphatest = false;
5012                 thread->alphafunc = GL_GREATER;
5013                 thread->alphavalue = 0.5f;
5014                 thread->viewport[0] = 0;
5015                 thread->viewport[1] = 0;
5016                 thread->viewport[2] = dpsoftrast.fb_width;
5017                 thread->viewport[3] = dpsoftrast.fb_height;
5018                 thread->scissor[0] = 0;
5019                 thread->scissor[1] = 0;
5020                 thread->scissor[2] = dpsoftrast.fb_width;
5021                 thread->scissor[3] = dpsoftrast.fb_height;
5022                 thread->depthrange[0] = 0;
5023                 thread->depthrange[1] = 1;
5024                 thread->polygonoffset[0] = 0;
5025                 thread->polygonoffset[1] = 0;
5026         
5027                 if (dpsoftrast.interlace)
5028                 {
5029                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5030                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5031                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5032                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5033                 }
5034                 else
5035                 {
5036                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5037                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5038                 }
5039
5040                 thread->numspans = 0;
5041                 thread->numtriangles = 0;
5042                 thread->commandoffset = 0;
5043                 thread->waiting = false;
5044                 thread->starving = false;
5045            
5046                 thread->validate = -1;
5047                 DPSOFTRAST_Validate(thread, -1);
5048  
5049                 if (dpsoftrast.usethreads)
5050                 {
5051                         thread->waitcond = Thread_CreateCond();
5052                         thread->drawcond = Thread_CreateCond();
5053                         thread->drawmutex = Thread_CreateMutex();
5054                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5055                 }
5056         }
5057         return 0;
5058 }
5059
5060 void DPSOFTRAST_Shutdown(void)
5061 {
5062         int i;
5063         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5064         {
5065                 DPSOFTRAST_State_Thread *thread;
5066                 for (i = 0; i < dpsoftrast.numthreads; i++)
5067                 {
5068                         thread = &dpsoftrast.threads[i];
5069                         Thread_LockMutex(thread->drawmutex);
5070                         thread->index = -1;
5071                         Thread_CondSignal(thread->drawcond);
5072                         Thread_UnlockMutex(thread->drawmutex);
5073                         Thread_WaitThread(thread->thread, 0);
5074                         Thread_DestroyCond(thread->waitcond);
5075                         Thread_DestroyCond(thread->drawcond);
5076                         Thread_DestroyMutex(thread->drawmutex);
5077                 }
5078         }
5079         for (i = 0;i < dpsoftrast.texture_end;i++)
5080                 if (dpsoftrast.texture[i].bytes)
5081                         MM_FREE(dpsoftrast.texture[i].bytes);
5082         if (dpsoftrast.texture)
5083                 free(dpsoftrast.texture);
5084         if (dpsoftrast.threads)
5085                 MM_FREE(dpsoftrast.threads);
5086         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5087 }
5088