]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fix GL20TU list for deluxemapping
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// byte alignments requested by ALIGN() and ATOMIC() respectively
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// compiler specific alignment and atomic primitives, only selected when
// SSE2 support is compiled in
#if defined(SSE2_PRESENT)
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                // alternative full barrier: (__sync_synchronize())
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_sub_and_fetch(&(counter), 1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                // alternative full barrier: (MemoryBarrier())
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// plain (non-atomic, unaligned) fallbacks for anything not defined above
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
68
// aligned allocation helpers: MM_MALLOC/MM_CALLOC/MM_FREE map to the
// _mm_malloc family (ATOMIC_SIZE alignment) when SSE2 is available and to the
// plain C allocator otherwise
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc equivalent on top of _mm_malloc: returns nmemb*size zeroed bytes,
// or NULL if the allocation fails or nmemb*size does not fit in size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // reject products that would wrap around, like calloc does
        if (size != 0 && nmemb > ((size_t)-1) / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION  = 0,
        DPSOFTRAST_ARRAY_COLOR     = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL     = 10
} DPSOFTRAST_ARRAY;
103
104 typedef struct DPSOFTRAST_Texture_s
105 {
106         int flags;
107         int width;
108         int height;
109         int depth;
110         int sides;
111         DPSOFTRAST_TEXTURE_FILTER filter;
112         int mipmaps;
113         int size;
114         ATOMIC_COUNTER binds;
115         unsigned char *bytes;
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
// commands share the alignment used by ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header common to every command; each DEFCOMMAND struct starts with the
// same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
} DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// declares the opcode constant DPSOFTRAST_OPCODE_<name> and the matching
// struct DPSOFTRAST_Command_<name> (common header followed by fields)
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
140
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
// evaluates an interpolated attribute at the start of a span (SSE version):
// slope receives the per-x step (attribs[arrayindex][0]) and data receives
// attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// same computation on plain float[4] arrays; every macro argument is
// parenthesized so that expressions such as &localspan can be passed
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels produced by rasterizing a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x, y; // framebuffer coords
        int startx, endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
} DPSOFTRAST_State_Span);
189
// capacity of the per-thread span and triangle arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of the per-thread validate field; a set bit means the matching
// derived values must be recalculated (see DPSOFTRAST_Validate)
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// blend modes stored in fb_blendmode by DPSOFTRAST_RecalcBlendFunc;
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
        DPSOFTRAST_BLENDMODE_ALPHA       = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
        DPSOFTRAST_BLENDMODE_ADD         = 3,
        DPSOFTRAST_BLENDMODE_INVMOD      = 4,
        DPSOFTRAST_BLENDMODE_MUL         = 5,
        DPSOFTRAST_BLENDMODE_MUL2        = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_INVADD      = 9,
        DPSOFTRAST_BLENDMODE_TOTAL       = 10
} DPSOFTRAST_BLENDMODE;
213
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
219         int cullface;
220         int colormask[4];
221         int blendfunc[2];
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
230         int viewport[4];
231         int scissor[4];
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237
238         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
239         
240         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
242
243         // DPSOFTRAST_VALIDATE_ flags
244         int validate;
245
246         // derived values (DPSOFTRAST_VALIDATE_FB)
247         int fb_colormask;
248         int fb_scissor[4];
249         ALIGN(float fb_viewportcenter[4]);
250         ALIGN(float fb_viewportscale[4]);
251
252         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253         int fb_depthfunc;
254
255         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
256         int fb_blendmode;
257
258         // band boundaries
259         int miny1;
260         int maxy1;
261         int miny2;
262         int maxy2;
263
264         ATOMIC(volatile int commandoffset);
265
266         volatile bool waiting;
267         volatile bool starving;
268         void *waitcond;
269         void *drawcond;
270         void *drawmutex;
271
272         int numspans;
273         int numtriangles;
274         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
276 }
277 DPSOFTRAST_State_Thread);
278
279 typedef ATOMIC(struct DPSOFTRAST_State_s
280 {
281         int fb_width;
282         int fb_height;
283         unsigned int *fb_depthpixels;
284         unsigned int *fb_colorpixels[4];
285
286         int viewport[4];
287         ALIGN(float fb_viewportcenter[4]);
288         ALIGN(float fb_viewportscale[4]);
289
290         float color[4];
291         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
293
294         const float *pointer_vertex3f;
295         const float *pointer_color4f;
296         const unsigned char *pointer_color4ub;
297         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298         int stride_vertex;
299         int stride_color;
300         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
303
304         int firstvertex;
305         int numvertices;
306         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307         float *screencoord4f;
308         int drawstarty;
309         int drawendy;
310         int drawclipped;
311         
312         int shader_mode;
313         int shader_permutation;
314
315         int texture_max;
316         int texture_end;
317         int texture_firstfree;
318         DPSOFTRAST_Texture *texture;
319
320         int bigendian;
321
322         // error reporting
323         const char *errorstring;
324
325         bool usethreads;
326         int interlace;
327         int numthreads;
328         DPSOFTRAST_State_Thread *threads;
329
330         ATOMIC(volatile int drawcommand);
331
332         DPSOFTRAST_State_Command_Pool commandpool;
333 }
334 DPSOFTRAST_State);
335
336 DPSOFTRAST_State dpsoftrast;
337
// scale applied when converting a float depth value to the integer depth buffer
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (0..1 maps to 0..255, rounded) into one int with
// b in bits 0-7, g in bits 8-15, r in bits 16-23 and a in bits 24-31;
// arguments are parenthesized so compound expressions convert correctly, and
// the alpha shift is done unsigned to avoid shifting into the sign bit
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
343
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
345 {
346         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348         fb_viewportcenter[3] = 0.5f;
349         fb_viewportcenter[0] = 0.0f;
350         fb_viewportscale[1] = 0.5f * viewport[2];
351         fb_viewportscale[2] = -0.5f * viewport[3];
352         fb_viewportscale[3] = 0.5f;
353         fb_viewportscale[0] = 1.0f;
354 }
355
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
357 {
358         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359         // and viewport projection values
360         int x1, x2;
361         int y1, y2;
362         x1 = thread->scissor[0];
363         x2 = thread->scissor[0] + thread->scissor[2];
364         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365         y2 = dpsoftrast.fb_height - thread->scissor[1];
366         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
367         if (x1 < 0) x1 = 0;
368         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
369         if (y1 < 0) y1 = 0;
370         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371         thread->fb_scissor[0] = x1;
372         thread->fb_scissor[1] = y1;
373         thread->fb_scissor[2] = x2 - x1;
374         thread->fb_scissor[3] = y2 - y1;
375
376         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
377 }
378
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
380 {
381         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
382 }
383
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
385 {
386         if (thread->blendsubtract)
387         {
388                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
389                 {
390                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
394                 }
395         }
396         else
397         {       
398                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
399                 {
400                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
405                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
411                 }
412         }
413 }
414
415 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
416
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
418 {
419         mask &= thread->validate;
420         if (!mask)
421                 return;
422         if (mask & DPSOFTRAST_VALIDATE_FB)
423         {
424                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425                 DPSOFTRAST_RecalcFB(thread);
426         }
427         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
428         {
429                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430                 DPSOFTRAST_RecalcDepthFunc(thread);
431         }
432         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
433         {
434                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435                 DPSOFTRAST_RecalcBlendFunc(thread);
436         }
437 }
438
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
440 {
441         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442                 return &dpsoftrast.texture[index];
443         return NULL;
444 }
445
446 static void DPSOFTRAST_Texture_Grow(void)
447 {
448         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449         DPSOFTRAST_State_Thread *thread;
450         int i;
451         int j;
452         DPSOFTRAST_Flush();
453         // expand texture array as needed
454         if (dpsoftrast.texture_max < 1024)
455                 dpsoftrast.texture_max = 1024;
456         else
457                 dpsoftrast.texture_max *= 2;
458         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
459         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460                 if (dpsoftrast.texbound[i])
461                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
462         for (j = 0; j < dpsoftrast.numthreads; j++)
463         {
464                 thread = &dpsoftrast.threads[j];
465                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466                         if (thread->texbound[i])
467                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
468         }
469 }
470
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
472 {
473         int w;
474         int h;
475         int d;
476         int size;
477         int s;
478         int texnum;
479         int mipmaps;
480         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482         DPSOFTRAST_Texture *texture;
483         if (width*height*depth < 1)
484         {
485                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
486                 return 0;
487         }
488         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
489         {
490                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
491                 return 0;
492         }
493         switch(texformat)
494         {
495         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
498                 break;
499         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
501                 {
502                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
503                         return 0;
504                 }
505                 if (depth != 1)
506                 {
507                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
508                         return 0;
509                 }
510                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
511                 {
512                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
513                         return 0;
514                 }
515                 break;
516         }
517         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
518         {
519                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
520                 return 0;
521         }
522         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
523         {
524                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
525                 return 0;
526         }
527         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
528         {
529                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
530                 return 0;
531         }
532         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
533         {
534                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
535                 return 0;
536         }
537         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
538         {
539                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
540                 return 0;
541         }
542         // find first empty slot in texture array
543         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544                 if (!dpsoftrast.texture[texnum].bytes)
545                         break;
546         dpsoftrast.texture_firstfree = texnum + 1;
547         if (dpsoftrast.texture_max <= texnum)
548                 DPSOFTRAST_Texture_Grow();
549         if (dpsoftrast.texture_end <= texnum)
550                 dpsoftrast.texture_end = texnum + 1;
551         texture = &dpsoftrast.texture[texnum];
552         memset(texture, 0, sizeof(*texture));
553         texture->flags = flags;
554         texture->width = width;
555         texture->height = height;
556         texture->depth = depth;
557         texture->sides = sides;
558         texture->binds = 0;
559         w = width;
560         h = height;
561         d = depth;
562         size = 0;
563         mipmaps = 0;
564         w = width;
565         h = height;
566         d = depth;
567         for (;;)
568         {
569                 s = w * h * d * sides * 4;
570                 texture->mipmap[mipmaps][0] = size;
571                 texture->mipmap[mipmaps][1] = s;
572                 texture->mipmap[mipmaps][2] = w;
573                 texture->mipmap[mipmaps][3] = h;
574                 texture->mipmap[mipmaps][4] = d;
575                 size += s;
576                 mipmaps++;
577                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578                         break;
579                 if (w > 1) w >>= 1;
580                 if (h > 1) h >>= 1;
581                 if (d > 1) d >>= 1;
582         }
583         texture->mipmaps = mipmaps;
584         texture->size = size;
585
586         // allocate the pixels now
587         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
588
589         return texnum;
590 }
591 void DPSOFTRAST_Texture_Free(int index)
592 {
593         DPSOFTRAST_Texture *texture;
594         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595         if (texture->binds)
596                 DPSOFTRAST_Flush();
597         if (texture->bytes)
598                 MM_FREE(texture->bytes);
599         texture->bytes = NULL;
600         memset(texture, 0, sizeof(*texture));
601         // adjust the free range and used range
602         if (dpsoftrast.texture_firstfree > index)
603                 dpsoftrast.texture_firstfree = index;
604         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605                 dpsoftrast.texture_end--;
606 }
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
608 {
609         int i, x, y, z, w, layer0, layer1, row0, row1;
610         unsigned char *o, *i0, *i1, *i2, *i3;
611         DPSOFTRAST_Texture *texture;
612         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613         if (texture->mipmaps <= 1)
614                 return;
615         for (i = 1;i < texture->mipmaps;i++)
616         {
617                 for (z = 0;z < texture->mipmap[i][4];z++)
618                 {
619                         layer0 = z*2;
620                         layer1 = z*2+1;
621                         if (layer1 >= texture->mipmap[i-1][4])
622                                 layer1 = texture->mipmap[i-1][4]-1;
623                         for (y = 0;y < texture->mipmap[i][3];y++)
624                         {
625                                 row0 = y*2;
626                                 row1 = y*2+1;
627                                 if (row1 >= texture->mipmap[i-1][3])
628                                         row1 = texture->mipmap[i-1][3]-1;
629                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
630                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634                                 w = texture->mipmap[i][2];
635                                 if (layer1 > layer0)
636                                 {
637                                         if (texture->mipmap[i-1][2] > 1)
638                                         {
639                                                 // average 3D texture
640                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
641                                                 {
642                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
646                                                 }
647                                         }
648                                         else
649                                         {
650                                                 // average 3D mipmap with parent width == 1
651                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
652                                                 {
653                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
657                                                 }
658                                         }
659                                 }
660                                 else
661                                 {
662                                         if (texture->mipmap[i-1][2] > 1)
663                                         {
664                                                 // average 2D texture (common case)
665                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
666                                                 {
667                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
671                                                 }
672                                         }
673                                         else
674                                         {
675                                                 // 2D texture with parent width == 1
676                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
677                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
678                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
679                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
680                                         }
681                                 }
682                         }
683                 }
684         }
685 }
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
687 {
688         DPSOFTRAST_Texture *texture;
689         unsigned char *dst;
690         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
691         if (texture->binds)
692                 DPSOFTRAST_Flush();
693         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694         while (blockheight > 0)
695         {
696                 memcpy(dst, pixels, blockwidth * 4);
697                 pixels += blockwidth * 4;
698                 dst += texture->mipmap[0][2] * 4;
699                 blockheight--;
700         }
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
707         if (texture->binds)
708                 DPSOFTRAST_Flush();
709         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710         DPSOFTRAST_Texture_CalculateMipmaps(index);
711 }
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
713 {
714         DPSOFTRAST_Texture *texture;
715         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716         return texture->mipmap[mip][2];
717 }
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
719 {
720         DPSOFTRAST_Texture *texture;
721         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722         return texture->mipmap[mip][3];
723 }
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
725 {
726         DPSOFTRAST_Texture *texture;
727         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728         return texture->mipmap[mip][4];
729 }
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
734         if (texture->binds)
735                 DPSOFTRAST_Flush();
736         return texture->bytes + texture->mipmap[mip][0];
737 }
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
739 {
740         DPSOFTRAST_Texture *texture;
741         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
743         {
744                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
745                 return;
746         }
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         texture->filter = filter;
750 }
751
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
753 {
754         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
757                 DPSOFTRAST_Flush();
758         dpsoftrast.fb_width = width;
759         dpsoftrast.fb_height = height;
760         dpsoftrast.fb_depthpixels = depthpixels;
761         dpsoftrast.fb_colorpixels[0] = colorpixels0;
762         dpsoftrast.fb_colorpixels[1] = colorpixels1;
763         dpsoftrast.fb_colorpixels[2] = colorpixels2;
764         dpsoftrast.fb_colorpixels[3] = colorpixels3;
765 }
766
767 static void DPSOFTRAST_Draw_FlushThreads(void);
768
769 static void DPSOFTRAST_Draw_SyncCommands(void)
770 {
771         if(dpsoftrast.usethreads) MEMORY_BARRIER;
772         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
773 }
774
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
776 {
777         DPSOFTRAST_State_Thread *thread;
778         int i;
779         int freecommand = dpsoftrast.commandpool.freecommand;
780         int usedcommands = dpsoftrast.commandpool.usedcommands;
781         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
782                 return;
783         DPSOFTRAST_Draw_SyncCommands();
784         for(;;)
785         {
786                 int waitindex = -1;
787                 int commandoffset;
788                 usedcommands = 0;
789                 for (i = 0; i < dpsoftrast.numthreads; i++)
790                 {
791                         thread = &dpsoftrast.threads[i]; 
792                         commandoffset = freecommand - thread->commandoffset;
793                         if (commandoffset < 0)
794                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
795                         if (commandoffset > usedcommands)
796                         {
797                                 waitindex = i;
798                                 usedcommands = commandoffset;
799                         }
800                 }
801                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
802                         break;
803                 thread = &dpsoftrast.threads[waitindex];
804                 Thread_LockMutex(thread->drawmutex);
805                 if (thread->commandoffset != dpsoftrast.drawcommand)
806                 {
807                         thread->waiting = true;
808                         if (thread->starving) Thread_CondSignal(thread->drawcond);
809                         Thread_CondWait(thread->waitcond, thread->drawmutex);
810                         thread->waiting = false;
811                 }
812                 Thread_UnlockMutex(thread->drawmutex);
813         }
814         dpsoftrast.commandpool.usedcommands = usedcommands;
815 }
816
// Pads `size` up to a multiple of COMMAND_SIZE; the masking with
// (COMMAND_SIZE-1) only works when COMMAND_SIZE is a power of two.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Reserves a DPSOFTRAST_Command_<name> in the command pool, tagged with
// DPSOFTRAST_OPCODE_<name>, and yields a typed pointer to it.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
821
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
823 {
824         DPSOFTRAST_Command *command;
825         int freecommand = dpsoftrast.commandpool.freecommand;
826         int usedcommands = dpsoftrast.commandpool.usedcommands;
827         int extra = sizeof(DPSOFTRAST_Command);
828         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
830         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
831         {
832                 if (dpsoftrast.usethreads)
833                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834                 else
835                         DPSOFTRAST_Draw_FlushThreads();
836                 freecommand = dpsoftrast.commandpool.freecommand;
837                 usedcommands = dpsoftrast.commandpool.usedcommands;
838         }
839         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
840         {
841                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842                 command->opcode = DPSOFTRAST_OPCODE_Reset;
843                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
844                 freecommand = 0;
845         }
846         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847         command->opcode = opcode;
848         command->commandsize = size;
849         freecommand += size;
850         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
851                 freecommand = 0;
852         dpsoftrast.commandpool.freecommand = freecommand;
853         dpsoftrast.commandpool.usedcommands = usedcommands + size;
854         return command;
855 }
856
857 static void DPSOFTRAST_UndoCommand(int size)
858 {
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         freecommand -= size;
862         if (freecommand < 0)
863                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864         usedcommands -= size;
865         dpsoftrast.commandpool.freecommand = freecommand;
866         dpsoftrast.commandpool.usedcommands = usedcommands;
867 }
868                 
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
871 {
872         thread->viewport[0] = command->x;
873         thread->viewport[1] = command->y;
874         thread->viewport[2] = command->width;
875         thread->viewport[3] = command->height;
876         thread->validate |= DPSOFTRAST_VALIDATE_FB;
877 }
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
879 {
880         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881         command->x = x;
882         command->y = y;
883         command->width = width;
884         command->height = height;
885
886         dpsoftrast.viewport[0] = x;
887         dpsoftrast.viewport[1] = y;
888         dpsoftrast.viewport[2] = width;
889         dpsoftrast.viewport[3] = height;
890         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
891 }
892
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
895 {
896         int i, x1, y1, x2, y2, w, h, x, y;
897         int miny1 = thread->miny1;
898         int maxy1 = thread->maxy1;
899         int miny2 = thread->miny2;
900         int maxy2 = thread->maxy2;
901         int bandy;
902         unsigned int *p;
903         unsigned int c;
904         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905         x1 = thread->fb_scissor[0];
906         y1 = thread->fb_scissor[1];
907         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
909         if (y1 < miny1) y1 = miny1;
910         if (y2 > maxy2) y2 = maxy2;
911         w = x2 - x1;
912         h = y2 - y1;
913         if (w < 1 || h < 1)
914                 return;
915         // FIXME: honor fb_colormask?
916         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917         for (i = 0;i < 4;i++)
918         {
919                 if (!dpsoftrast.fb_colorpixels[i])
920                         continue;
921                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922                 for (;y < bandy;y++)
923                 {
924                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925                         for (x = x1;x < x2;x++)
926                                 p[x] = c;
927                 }
928         }
929 }
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
931 {
932         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
933         command->r = r;
934         command->g = g;
935         command->b = b;
936         command->a = a;
937 }
938
939 DEFCOMMAND(3, ClearDepth, float depth;)
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
941 {
942         int x1, y1, x2, y2, w, h, x, y;
943         int miny1 = thread->miny1;
944         int maxy1 = thread->maxy1;
945         int miny2 = thread->miny2;
946         int maxy2 = thread->maxy2;
947         int bandy;
948         unsigned int *p;
949         unsigned int c;
950         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
962         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963         for (;y < bandy;y++)
964         {
965                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966                 for (x = x1;x < x2;x++)
967                         p[x] = c;
968         }
969 }
970 void DPSOFTRAST_ClearDepth(float d)
971 {
972         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
973         command->depth = d;
974 }
975
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
978 {
979         thread->colormask[0] = command->r != 0;
980         thread->colormask[1] = command->g != 0;
981         thread->colormask[2] = command->b != 0;
982         thread->colormask[3] = command->a != 0;
983         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
984 }
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
986 {
987         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(5, DepthTest, int enable;)
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
996 {
997         thread->depthtest = command->enable;
998         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
999 }
1000 void DPSOFTRAST_DepthTest(int enable)
1001 {
1002         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003         command->enable = enable;
1004 }
1005
1006 DEFCOMMAND(6, ScissorTest, int enable;)
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1008 {
1009         thread->scissortest = command->enable;
1010         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1011 }
1012 void DPSOFTRAST_ScissorTest(int enable)
1013 {
1014         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015         command->enable = enable;
1016 }
1017
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1020 {
1021         thread->scissor[0] = command->x;
1022         thread->scissor[1] = command->y;
1023         thread->scissor[2] = command->width;
1024         thread->scissor[3] = command->height;
1025         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 }
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1028 {
1029         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030         command->x = x;
1031         command->y = y;
1032         command->width = width;
1033         command->height = height;
1034 }
1035
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1038 {
1039         thread->blendfunc[0] = command->sfactor;
1040         thread->blendfunc[1] = command->dfactor;
1041         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1042 }
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1044 {
1045         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046         command->sfactor = sfactor;
1047         command->dfactor = dfactor;
1048 }
1049
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1052 {
1053         thread->blendsubtract = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1055 }
1056 void DPSOFTRAST_BlendSubtract(int enable)
1057 {
1058         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(10, DepthMask, int enable;)
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1064 {
1065         thread->depthmask = command->enable;
1066 }
1067 void DPSOFTRAST_DepthMask(int enable)
1068 {
1069         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070         command->enable = enable;
1071 }
1072
1073 DEFCOMMAND(11, DepthFunc, int func;)
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1075 {
1076         thread->depthfunc = command->func;
1077 }
1078 void DPSOFTRAST_DepthFunc(int func)
1079 {
1080         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081         command->func = func;
1082 }
1083
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1086 {
1087         thread->depthrange[0] = command->nearval;
1088         thread->depthrange[1] = command->farval;
1089 }
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1091 {
1092         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093         command->nearval = nearval;
1094         command->farval = farval;
1095 }
1096
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1099 {
1100         thread->polygonoffset[0] = command->alongnormal;
1101         thread->polygonoffset[1] = command->intoview;
1102 }
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1104 {
1105         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106         command->alongnormal = alongnormal;
1107         command->intoview = intoview;
1108 }
1109
1110 DEFCOMMAND(14, CullFace, int mode;)
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1112 {
1113         thread->cullface = command->mode;
1114 }
1115 void DPSOFTRAST_CullFace(int mode)
1116 {
1117         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118         command->mode = mode;
1119 }
1120
1121 DEFCOMMAND(15, AlphaTest, int enable;)
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1123 {
1124         thread->alphatest = command->enable;
1125 }
1126 void DPSOFTRAST_AlphaTest(int enable)
1127 {
1128         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129         command->enable = enable;
1130 }
1131
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1134 {
1135         thread->alphafunc = command->func;
1136         thread->alphavalue = command->ref;
1137 }
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1139 {
1140         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141         command->func = func;
1142         command->ref = ref;
1143 }
1144
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1146 {
1147         dpsoftrast.color[0] = r;
1148         dpsoftrast.color[1] = g;
1149         dpsoftrast.color[2] = b;
1150         dpsoftrast.color[3] = a;
1151 }
1152
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1154 {
1155         int outstride = blockwidth * 4;
1156         int instride = dpsoftrast.fb_width * 4;
1157         int bx1 = blockx;
1158         int by1 = blocky;
1159         int bx2 = blockx + blockwidth;
1160         int by2 = blocky + blockheight;
1161         int bw;
1162         int x;
1163         int y;
1164         unsigned char *inpixels;
1165         unsigned char *b;
1166         unsigned char *o;
1167         DPSOFTRAST_Flush();
1168         if (bx1 < 0) bx1 = 0;
1169         if (by1 < 0) by1 = 0;
1170         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1172         bw = bx2 - bx1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         DPSOFTRAST_Flush();
1227         spixels = dpsoftrast.fb_colorpixels[0];
1228         swidth = dpsoftrast.fb_width;
1229         sheight = dpsoftrast.fb_height;
1230         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231         twidth = texture->mipmap[mip][2];
1232         theight = texture->mipmap[mip][3];
1233         if (tx1 < 0) tx1 = 0;
1234         if (ty1 < 0) ty1 = 0;
1235         if (tx2 > twidth) tx2 = twidth;
1236         if (ty2 > theight) ty2 = theight;
1237         if (sx1 < 0) sx1 = 0;
1238         if (sy1 < 0) sy1 = 0;
1239         if (sx2 > swidth) sx2 = swidth;
1240         if (sy2 > sheight) sy2 = sheight;
1241         tw = tx2 - tx1;
1242         th = ty2 - ty1;
1243         sw = sx2 - sx1;
1244         sh = sy2 - sy1;
1245         if (tw > sw) tw = sw;
1246         if (th > sh) th = sh;
1247         if (tw < 1 || th < 1)
1248                 return;
1249         sy1 = sheight - 1 - sy1;
1250         for (y = 0;y < th;y++)
1251                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1252         if (texture->mipmaps > 1)
1253                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1254 }
1255
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 {
1259         if (thread->texbound[command->unitnum])
1260                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->unitnum = unitnum;
1281         command->texture = texture;
1282
1283         dpsoftrast.texbound[unitnum] = texture;
1284         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->mode = mode;
1321         command->permutation = permutation;
1322
1323         dpsoftrast.shader_mode = mode;
1324         dpsoftrast.shader_permutation = permutation;
1325 }
1326
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1329 {
1330         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1331 }
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         command->val[0] = v0;
1337         command->val[1] = v1;
1338         command->val[2] = v2;
1339         command->val[3] = v3;
1340
1341         dpsoftrast.uniform4f[index*4+0] = v0;
1342         dpsoftrast.uniform4f[index*4+1] = v1;
1343         dpsoftrast.uniform4f[index*4+2] = v2;
1344         dpsoftrast.uniform4f[index*4+3] = v3;
1345 }
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 {
1348         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349         command->index = index;
1350         memcpy(command->val, v, sizeof(command->val));
1351
1352         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1353 }
1354
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1361 {
1362 #ifdef SSE2_PRESENT
1363         int i, index;
1364         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365         {
1366                 __m128 m0, m1, m2, m3;
1367                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368                 command->index = (DPSOFTRAST_UNIFORM)index;
1369                 if (((size_t)v)&(ALIGN_SIZE-1))
1370                 {
1371                         m0 = _mm_loadu_ps(v);
1372                         m1 = _mm_loadu_ps(v+4);
1373                         m2 = _mm_loadu_ps(v+8);
1374                         m3 = _mm_loadu_ps(v+12);
1375                 }
1376                 else
1377                 {
1378                         m0 = _mm_load_ps(v);
1379                         m1 = _mm_load_ps(v+4);
1380                         m2 = _mm_load_ps(v+8);
1381                         m3 = _mm_load_ps(v+12);
1382                 }
1383                 if (transpose)
1384                 {
1385                         __m128 t0, t1, t2, t3;
1386                         t0 = _mm_unpacklo_ps(m0, m1);
1387                         t1 = _mm_unpacklo_ps(m2, m3);
1388                         t2 = _mm_unpackhi_ps(m0, m1);
1389                         t3 = _mm_unpackhi_ps(m2, m3);
1390                         m0 = _mm_movelh_ps(t0, t1);
1391                         m1 = _mm_movehl_ps(t1, t0);
1392                         m2 = _mm_movelh_ps(t2, t3);
1393                         m3 = _mm_movehl_ps(t3, t2);                     
1394                 }
1395                 _mm_store_ps(command->val, m0);
1396                 _mm_store_ps(command->val+4, m1);
1397                 _mm_store_ps(command->val+8, m2);
1398                 _mm_store_ps(command->val+12, m3);
1399                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1403         }
1404 #endif
1405 }
1406
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1409 {
1410         thread->uniform1i[command->index] = command->val;
1411 }
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1413 {
1414         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415         command->index = index;
1416         command->val = i0;
1417
1418         dpsoftrast.uniform1i[command->index] = i0;
1419 }
1420
1421 #ifdef SSE2_PRESENT
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1423 {
1424         float *end = dst + size*4;
1425         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1426         {
1427                 while (dst < end)
1428                 {
1429                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1430                         dst += 4;
1431                         src += stride;
1432                 }
1433         }
1434         else
1435         {
1436                 while (dst < end)
1437                 {
1438                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1439                         dst += 4;
1440                         src += stride;
1441                 }
1442         }
1443 }
1444
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1446 {
1447         float *end = dst + size*4;
1448         if (stride == sizeof(float[3]))
1449         {
1450                 float *end4 = dst + (size&~3)*4;        
1451                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1452                 {
1453                         while (dst < end4)
1454                         {
1455                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1456                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dst += 16;
1469                                 src += 4*sizeof(float[3]);
1470                         }
1471                 }
1472                 else
1473                 {
1474                         while (dst < end4)
1475                         {
1476                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489                                 dst += 16;
1490                                 src += 4*sizeof(float[3]);
1491                         }
1492                 }
1493         }
1494         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1495         {
1496                 while (dst < end)
1497                 {
1498                         __m128 v = _mm_loadu_ps((const float *)src);
1499                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502                         _mm_store_ps(dst, v);
1503                         dst += 4;
1504                         src += stride;
1505                 }
1506         }
1507         else
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_load_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520 }
1521
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1523 {
1524         float *end = dst + size*4;
1525         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526         if (stride == sizeof(float[2]))
1527         {
1528                 float *end2 = dst + (size&~1)*4;
1529                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1530                 {
1531                         while (dst < end2)
1532                         {
1533                                 __m128 v = _mm_loadu_ps((const float *)src);
1534                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1536                                 dst += 8;
1537                                 src += 2*sizeof(float[2]);
1538                         }
1539                 }
1540                 else
1541                 {
1542                         while (dst < end2)
1543                         {
1544                                 __m128 v = _mm_load_ps((const float *)src);
1545                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1547                                 dst += 8;
1548                                 src += 2*sizeof(float[2]);
1549                         }
1550                 }
1551         }
1552         while (dst < end)
1553         {
1554                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1555                 dst += 4;
1556                 src += stride;
1557         }
1558 }
1559
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 {
1562         float *end = dst + size*4;
1563         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564         if (stride == sizeof(unsigned char[4]))
1565         {
1566                 float *end4 = dst + (size&~3)*4;
1567                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1568                 {
1569                         while (dst < end4)
1570                         {
1571                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576                                 dst += 16;
1577                                 src += 4*sizeof(unsigned char[4]);
1578                         }
1579                 }
1580                 else
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593         }
1594         while (dst < end)
1595         {
1596                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1598                 dst += 4;
1599                 src += stride;
1600         }
1601 }
1602
// writes the 4-float vector found at src into each of the first `size` 4-float slots of dst
// src may be unaligned; dst is written with _mm_store_ps and therefore has to be 16-byte aligned
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
        const __m128 value = _mm_loadu_ps(src);
        int slot;
        for (slot = 0; slot < size; slot++)
                _mm_store_ps(dst + 4*slot, value);
}
1613 #endif
1614
// transforms numitems 4-float vectors: out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// an identity matrix is detected up front and turned into a plain copy (skipped when out4f == in4f)
// out4f is written with _mm_store_ps and therefore has to be 16-byte aligned; in4f may be unaligned
// without SSE2_PRESENT this function does nothing
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 col0, col1, col2, col3;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // fast case for identity matrix
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        col0 = _mm_loadu_ps(inmatrix16f);
        col1 = _mm_loadu_ps(inmatrix16f + 4);
        col2 = _mm_loadu_ps(inmatrix16f + 8);
        col3 = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        const __m128 v = _mm_loadu_ps(in4f);
                        const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
                        const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
                        const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
                        const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
                        // summed as x + (y + (z + w)) to keep the rounding of the nested expression
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
        else
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        const __m128 v = _mm_load_ps(in4f);
                        const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
                        const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
                        const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
                        const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
                        // summed as x + (y + (z + w)) to keep the rounding of the nested expression
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
#endif
}
1662
// copies numitems 4-float vectors from in4f to out4f; done with memcpy, so the two ranges must not overlap
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        const size_t numbytes = numitems * sizeof(float[4]);
        memcpy(out4f, in4f, numbytes);
}
1667
1668 #ifdef SSE2_PRESENT
// projects the vector `in` = (x, y, z, w) and assigns the result to `out`
// the vector is rearranged to (1, x, y, z), multiplied by viewportscale, divided by w and offset by
// viewportcenter, then rotated back, which gives
//   out = (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w, center[3] + scale[3]*z/w, center[0] + scale[0]/w)
// i.e. element 0 of viewportcenter/viewportscale applies to the 1/w term and elements 1..3 apply to x, y, z
// expands to a brace block declaring locals named p and w, so no argument may be an expression that uses
// variables with those names; `out` and `in` may be the same variable
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1676
// NOTE(review): despite the name this expands to exactly the same code as DPSOFTRAST_PROJECTVERTEX and
// projects all four lanes: `in` = (x, y, z, w) is rearranged to (1, x, y, z), multiplied by viewportscale,
// divided by w, offset by viewportcenter and rotated back before being assigned to `out`
// expands to a brace block declaring locals named p and w, so no argument may be an expression that uses
// variables with those names
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1684
// multiplies the vector `in` = (x, y, z, w) by the matrix whose columns are m0..m3 and assigns the result
// to `out`: out = x*m0 + (y*m1 + (z*m2 + w*m3))
// expands to a brace block declaring a local named p, so `in` must not be an expression that uses a
// variable with that name; `out` and `in` may be the same variable
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1693
// computes the range of screen rows covered by the axis-aligned box spanned by minpos/maxpos once its
// eight corners are transformed by the matrix with columns m0..m3
// corners failing the z + w >= 0 test are not projected directly; instead the box edges connecting them
// to passing corners are intersected with the z + w = 0 plane and those intersection points are projected
// the resulting bounds are written to *starty and *endy
// returns a mask in which bit k is still set if corner k failed the z + w >= 0 test (0 = no corner failed)
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
        int clipmask = 0xFF;
        // minproj/maxproj track the smallest/largest projected y in lane 0; they start out inverted (2 and -2)
        // so that the first projected point replaces both
        __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        // swap lanes 0 and 1 of every matrix column so the transformed y lands in lane 0,
        // which is the lane the scalar (_ss) intrinsics below operate on
        m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
        m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
        m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
        m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transforms corner k into bb[k] and puts z + w into lane 0 of clipdist[k];
        // when that distance is >= 0 the corner's clipmask bit is cleared and y/w is merged into minproj/maxproj
        // NOTE(review): BBFRONT and BBCLIP are not #undef'd in this function and stay defined afterwards
        #define BBFRONT(k, pos) \
        { \
                DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
                clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
                { \
                        __m128 proj; \
                        clipmask &= ~(1<<k); \
                        proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                        minproj = _mm_min_ss(minproj, proj); \
                        maxproj = _mm_max_ss(maxproj, proj); \
                } \
        }
        // corner numbering: bit 0 of k selects max x, bit 1 selects max y, bit 2 selects max z
        // (the w component is taken from the same source as z)
        BBFRONT(0, minpos); 
        BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
        BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
        BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner whose clipmask bit is still set, looks at the three corners that differ
        // from it in exactly one axis (k^1, k^2, k^4); for each of those that passed the test, interpolates
        // along the connecting edge to the point where z + w = 0 (frac = d_k / (d_k - d_neighbour)) and
        // merges that point's y/w into minproj/maxproj
        #define BBCLIP(k) \
        { \
                if (clipmask&(1<<k)) \
                { \
                        if (!(clipmask&(1<<(k^1)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^2)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^4)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                } \
        }
        BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring element 2 of the viewport vectors (the one DPSOFTRAST_PROJECTVERTEX applies to y) into lane 0
        viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
        viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // limit the projected range to [-2, 2] before converting it to screen rows
        minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
        maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
        minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
        maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // starty comes from the largest projected y and endy from the smallest (both truncated, endy + 1)
        // NOTE(review): presumably the y viewport scale is negative so that larger projected y means a
        // smaller screen row -- confirm against where fb_viewportscale is set up
        *starty = _mm_cvttss_si32(maxproj);
        *endy = _mm_cvttss_si32(minproj)+1;
        return clipmask;
}
1764         
1765 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1766 {
1767         float *end = out4f + numitems*4;
1768         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1769         __m128 minpos, maxpos;
1770         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1771         {
1772                 minpos = maxpos = _mm_loadu_ps(in4f);
1773                 while (out4f < end)
1774                 {
1775                         __m128 v = _mm_loadu_ps(in4f);
1776                         minpos = _mm_min_ps(minpos, v);
1777                         maxpos = _mm_max_ps(maxpos, v);
1778                         _mm_store_ps(out4f, v);
1779                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1780                         _mm_store_ps(screen4f, v);
1781                         in4f += 4;
1782                         out4f += 4;
1783                         screen4f += 4;
1784                 }
1785         }
1786         else
1787         {
1788                 minpos = maxpos = _mm_load_ps(in4f);
1789                 while (out4f < end)
1790                 {
1791                         __m128 v = _mm_load_ps(in4f);
1792                         minpos = _mm_min_ps(minpos, v);
1793                         maxpos = _mm_max_ps(maxpos, v);
1794                         _mm_store_ps(out4f, v);
1795                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1796                         _mm_store_ps(screen4f, v);
1797                         in4f += 4;
1798                         out4f += 4;
1799                         screen4f += 4;
1800                 }
1801         }
1802         if (starty && endy) 
1803                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1804                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1805                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1806                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1807                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1808         return 0;
1809 }
1810
1811 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1812 {
1813         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1814         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1815         float *end;
1816         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1817                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1818         end = out4f + numitems*4;
1819         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1820         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1821         m0 = _mm_loadu_ps(inmatrix16f);
1822         m1 = _mm_loadu_ps(inmatrix16f + 4);
1823         m2 = _mm_loadu_ps(inmatrix16f + 8);
1824         m3 = _mm_loadu_ps(inmatrix16f + 12);
1825         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1826         {
1827                 minpos = maxpos = _mm_loadu_ps(in4f);
1828                 while (out4f < end)
1829                 {
1830                         __m128 v = _mm_loadu_ps(in4f);
1831                         minpos = _mm_min_ps(minpos, v);
1832                         maxpos = _mm_max_ps(maxpos, v);
1833                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1834                         _mm_store_ps(out4f, v);
1835                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1836                         _mm_store_ps(screen4f, v);
1837                         in4f += 4;
1838                         out4f += 4;
1839                         screen4f += 4;
1840                 }
1841         }
1842         else
1843         {
1844                 minpos = maxpos = _mm_load_ps(in4f);
1845                 while (out4f < end)
1846                 {
1847                         __m128 v = _mm_load_ps(in4f);
1848                         minpos = _mm_min_ps(minpos, v);
1849                         maxpos = _mm_max_ps(maxpos, v);
1850                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1851                         _mm_store_ps(out4f, v);
1852                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853                         _mm_store_ps(screen4f, v);
1854                         in4f += 4;
1855                         out4f += 4;
1856                         screen4f += 4;
1857                 }
1858         }
1859         if (starty && endy) 
1860                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1861         return 0;
1862 }
1863 #endif
1864
// expands the vertex attribute selected by inarray into 4-float vectors in dpsoftrast.post_array4f[outarray],
// covering dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex, and returns that array
// - DPSOFTRAST_ARRAY_POSITION: 3 floats per vertex from pointer_vertex3f
// - DPSOFTRAST_ARRAY_COLOR: pointer_color4f if set, else pointer_color4ub if set, else the constant dpsoftrast.color
// - anything else is treated as texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with 2, 3 or 4 components;
//   the output array is left untouched when that unit has no pointer or another component count
// without SSE2_PRESENT this returns NULL
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
        float * const dest = dpsoftrast.post_array4f[outarray];
        const int first = dpsoftrast.firstvertex;
        const int count = dpsoftrast.numvertices;
        const unsigned char *bytes;
        int stride;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                bytes = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
                DPSOFTRAST_Load3fTo4f(dest, bytes, count, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        bytes = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
                        DPSOFTRAST_Load4fTo4f(dest, bytes, count, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        bytes = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
                        DPSOFTRAST_Load4bTo4f(dest, bytes, count, stride);
                }
                else
                {
                        // no per-vertex colors: replicate the constant color
                        DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
                }
        }
        else
        {
                const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        bytes = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
                        switch(dpsoftrast.components_texcoord[unit])
                        {
                        case 2:
                                DPSOFTRAST_Load2fTo4f(dest, bytes, count, stride);
                                break;
                        case 3:
                                DPSOFTRAST_Load3fTo4f(dest, bytes, count, stride);
                                break;
                        case 4:
                                DPSOFTRAST_Load4fTo4f(dest, bytes, count, stride);
                                break;
                        default:
                                break;
                        }
                }
        }
        return dest;
#else
        return NULL;
#endif
}
1923
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1925 {
1926         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1928         return data;
1929 }
1930
#if 0
// currently compiled out
// fills post array `outarray` from input array `inarray` (a negative inarray reuses the post array's current
// contents), projects it into dpsoftrast.screencoord4f and records the clip mask and y bounds in
// dpsoftrast.drawclipped / drawstarty / drawendy; returns the array, or NULL without SSE2_PRESENT
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
        return data;
#else
        return NULL;
#endif
}
#endif
1943
// fills post array `outarray` from input array `inarray` (a negative inarray reuses the post array's current
// contents), transforms it in place by inmatrix16f, writes the projected vertices to dpsoftrast.screencoord4f
// and records the clip mask and y bounds in dpsoftrast.drawclipped / drawstarty / drawendy
// returns the array, or NULL without SSE2_PRESENT
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
1954
1955 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1956 {
1957         int x;
1958         int startx = span->startx;
1959         int endx = span->endx;
1960         float wslope = triangle->w[0];
1961         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1962         float endz = 1.0f / (w + wslope * startx);
1963         for (x = startx;x < endx;)
1964         {
1965                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1966                 float z = endz, dz;
1967                 if (nextsub >= endx) nextsub = endsub = endx-1;
1968                 endz = 1.0f / (w + wslope * nextsub);
1969                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1970                 for (; x <= endsub; x++, z += dz)
1971                         zf[x] = z;
1972         }
1973 }
1974
1975 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1976 {
1977         int x;
1978         int startx = span->startx;
1979         int endx = span->endx;
1980         int d[4];
1981         float a, b;
1982         unsigned char * RESTRICT pixelmask = span->pixelmask;
1983         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1984         if (!pixel)
1985                 return;
1986         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1987         // handle alphatest now (this affects depth writes too)
1988         if (thread->alphatest)
1989                 for (x = startx;x < endx;x++)
1990                         if (in4f[x*4+3] < 0.5f)
1991                                 pixelmask[x] = false;
1992         // FIXME: this does not handle bigendian
1993         switch(thread->fb_blendmode)
1994         {
1995         case DPSOFTRAST_BLENDMODE_OPAQUE:
1996                 for (x = startx;x < endx;x++)
1997                 {
1998                         if (!pixelmask[x])
1999                                 continue;
2000                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2001                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2002                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2003                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2004                         pixel[x*4+0] = d[0];
2005                         pixel[x*4+1] = d[1];
2006                         pixel[x*4+2] = d[2];
2007                         pixel[x*4+3] = d[3];
2008                 }
2009                 break;
2010         case DPSOFTRAST_BLENDMODE_ALPHA:
2011                 for (x = startx;x < endx;x++)
2012                 {
2013                         if (!pixelmask[x])
2014                                 continue;
2015                         a = in4f[x*4+3] * 255.0f;
2016                         b = 1.0f - in4f[x*4+3];
2017                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2018                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2019                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2020                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2021                         pixel[x*4+0] = d[0];
2022                         pixel[x*4+1] = d[1];
2023                         pixel[x*4+2] = d[2];
2024                         pixel[x*4+3] = d[3];
2025                 }
2026                 break;
2027         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2028                 for (x = startx;x < endx;x++)
2029                 {
2030                         if (!pixelmask[x])
2031                                 continue;
2032                         a = in4f[x*4+3] * 255.0f;
2033                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2034                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2035                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2036                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2037                         pixel[x*4+0] = d[0];
2038                         pixel[x*4+1] = d[1];
2039                         pixel[x*4+2] = d[2];
2040                         pixel[x*4+3] = d[3];
2041                 }
2042                 break;
2043         case DPSOFTRAST_BLENDMODE_ADD:
2044                 for (x = startx;x < endx;x++)
2045                 {
2046                         if (!pixelmask[x])
2047                                 continue;
2048                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2049                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2050                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2051                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2052                         pixel[x*4+0] = d[0];
2053                         pixel[x*4+1] = d[1];
2054                         pixel[x*4+2] = d[2];
2055                         pixel[x*4+3] = d[3];
2056                 }
2057                 break;
2058         case DPSOFTRAST_BLENDMODE_INVMOD:
2059                 for (x = startx;x < endx;x++)
2060                 {
2061                         if (!pixelmask[x])
2062                                 continue;
2063                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2064                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2065                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2066                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2067                         pixel[x*4+0] = d[0];
2068                         pixel[x*4+1] = d[1];
2069                         pixel[x*4+2] = d[2];
2070                         pixel[x*4+3] = d[3];
2071                 }
2072                 break;
2073         case DPSOFTRAST_BLENDMODE_MUL:
2074                 for (x = startx;x < endx;x++)
2075                 {
2076                         if (!pixelmask[x])
2077                                 continue;
2078                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2079                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2080                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2081                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2082                         pixel[x*4+0] = d[0];
2083                         pixel[x*4+1] = d[1];
2084                         pixel[x*4+2] = d[2];
2085                         pixel[x*4+3] = d[3];
2086                 }
2087                 break;
2088         case DPSOFTRAST_BLENDMODE_MUL2:
2089                 for (x = startx;x < endx;x++)
2090                 {
2091                         if (!pixelmask[x])
2092                                 continue;
2093                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2094                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2095                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2096                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2097                         pixel[x*4+0] = d[0];
2098                         pixel[x*4+1] = d[1];
2099                         pixel[x*4+2] = d[2];
2100                         pixel[x*4+3] = d[3];
2101                 }
2102                 break;
2103         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2104                 for (x = startx;x < endx;x++)
2105                 {
2106                         if (!pixelmask[x])
2107                                 continue;
2108                         a = in4f[x*4+3] * -255.0f;
2109                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2110                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2111                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2112                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2113                         pixel[x*4+0] = d[0];
2114                         pixel[x*4+1] = d[1];
2115                         pixel[x*4+2] = d[2];
2116                         pixel[x*4+3] = d[3];
2117                 }
2118                 break;
2119         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2120                 for (x = startx;x < endx;x++)
2121                 {
2122                         if (!pixelmask[x])
2123                                 continue;
2124                         a = 255.0f;
2125                         b = 1.0f - in4f[x*4+3];
2126                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2127                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2128                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2129                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2130                         pixel[x*4+0] = d[0];
2131                         pixel[x*4+1] = d[1];
2132                         pixel[x*4+2] = d[2];
2133                         pixel[x*4+3] = d[3];
2134                 }
2135                 break;
2136         case DPSOFTRAST_BLENDMODE_INVADD:
2137                 for (x = startx;x < endx;x++)
2138                 {
2139                         if (!pixelmask[x])
2140                                 continue;
2141                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2142                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2144                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145                         pixel[x*4+0] = d[0];
2146                         pixel[x*4+1] = d[1];
2147                         pixel[x*4+2] = d[2];
2148                         pixel[x*4+3] = d[3];
2149                 }
2150                 break;
2151         }
2152 }
2153
2154 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2155 {
2156 #ifdef SSE2_PRESENT
2157         int x;
2158         int startx = span->startx;
2159         int endx = span->endx;
2160         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2161         unsigned char * RESTRICT pixelmask = span->pixelmask;
2162         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2163         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2164         if (!pixel)
2165                 return;
2166         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2167         pixeli += span->y * dpsoftrast.fb_width + span->x;
2168         // handle alphatest now (this affects depth writes too)
2169         if (thread->alphatest)
2170                 for (x = startx;x < endx;x++)
2171                         if (in4ub[x*4+3] < 0.5f)
2172                                 pixelmask[x] = false;
2173         // FIXME: this does not handle bigendian
2174         switch(thread->fb_blendmode)
2175         {
2176         case DPSOFTRAST_BLENDMODE_OPAQUE:
2177                 for (x = startx;x + 4 <= endx;)
2178                 {
2179                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2180                         {
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182                                 x += 4;
2183                         }
2184                         else
2185                         {
2186                                 if (pixelmask[x])
2187                                         pixeli[x] = ini[x];
2188                                 x++;
2189                         }
2190                 }
2191                 for (;x < endx;x++)
2192                         if (pixelmask[x])
2193                                 pixeli[x] = ini[x];
2194                 break;
2195         case DPSOFTRAST_BLENDMODE_ALPHA:
2196         #define FINISHBLEND(blend2, blend1) \
2197                 for (x = startx;x + 1 < endx;x += 2) \
2198                 { \
2199                         __m128i src, dst; \
2200                         switch (*(const unsigned short*)&pixelmask[x]) \
2201                         { \
2202                         case 0x0101: \
2203                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2204                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2205                                 blend2; \
2206                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2207                                 continue; \
2208                         case 0x0100: \
2209                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2210                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2211                                 blend1; \
2212                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2213                                 continue; \
2214                         case 0x0001: \
2215                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2216                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2217                                 blend1; \
2218                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2219                                 continue; \
2220                         } \
2221                         break; \
2222                 } \
2223                 for(;x < endx; x++) \
2224                 { \
2225                         __m128i src, dst; \
2226                         if (!pixelmask[x]) \
2227                                 continue; \
2228                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2230                         blend1; \
2231                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2232                 }
2233
2234                 FINISHBLEND({
2235                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2236                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2237                 }, {
2238                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2239                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                 });
2241                 break;
2242         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243                 FINISHBLEND({
2244                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2246                 }, {
2247                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_ADD:
2252                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_INVMOD:
2255                 FINISHBLEND({
2256                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257                 }, {
2258                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2259                 });
2260                 break;
2261         case DPSOFTRAST_BLENDMODE_MUL:
2262                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2263                 break;
2264         case DPSOFTRAST_BLENDMODE_MUL2:
2265                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2266                 break;
2267         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2268                 FINISHBLEND({
2269                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271                 }, {
2272                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                 });
2275                 break;
2276         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2277                 FINISHBLEND({
2278                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2280                 }, {
2281                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                 });
2284                 break;
2285         case DPSOFTRAST_BLENDMODE_INVADD:
2286                 FINISHBLEND({
2287                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288                 }, {
2289                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2290                 });
2291                 break;
2292         }
2293 #endif
2294 }
2295
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2297 {
2298         int x;
2299         int startx = span->startx;
2300         int endx = span->endx;
2301         int flags;
2302         float c[4];
2303         float data[4];
2304         float slope[4];
2305         float tc[2], endtc[2];
2306         float tcscale[2];
2307         unsigned int tci[2];
2308         unsigned int tci1[2];
2309         unsigned int tcimin[2];
2310         unsigned int tcimax[2];
2311         int tciwrapmask[2];
2312         int tciwidth;
2313         int filter;
2314         int mip;
2315         const unsigned char * RESTRICT pixelbase;
2316         const unsigned char * RESTRICT pixel[4];
2317         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318         // if no texture is bound, just fill it with white
2319         if (!texture)
2320         {
2321                 for (x = startx;x < endx;x++)
2322                 {
2323                         out4f[x*4+0] = 1.0f;
2324                         out4f[x*4+1] = 1.0f;
2325                         out4f[x*4+2] = 1.0f;
2326                         out4f[x*4+3] = 1.0f;
2327                 }
2328                 return;
2329         }
2330         mip = triangle->mip[texunitindex];
2331         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332         // if this mipmap of the texture is 1 pixel, just fill it with that color
2333         if (texture->mipmap[mip][1] == 4)
2334         {
2335                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339                 for (x = startx;x < endx;x++)
2340                 {
2341                         out4f[x*4+0] = c[0];
2342                         out4f[x*4+1] = c[1];
2343                         out4f[x*4+2] = c[2];
2344                         out4f[x*4+3] = c[3];
2345                 }
2346                 return;
2347         }
2348         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350         flags = texture->flags;
2351         tcscale[0] = texture->mipmap[mip][2];
2352         tcscale[1] = texture->mipmap[mip][3];
2353         tciwidth = texture->mipmap[mip][2];
2354         tcimin[0] = 0;
2355         tcimin[1] = 0;
2356         tcimax[0] = texture->mipmap[mip][2]-1;
2357         tcimax[1] = texture->mipmap[mip][3]-1;
2358         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2360         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2361         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2362         for (x = startx;x < endx;)
2363         {
2364                 unsigned int subtc[2];
2365                 unsigned int substep[2];
2366                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2367                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2368                 if (nextsub >= endx)
2369                 {
2370                         nextsub = endsub = endx-1;      
2371                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2372                 }
2373                 tc[0] = endtc[0];
2374                 tc[1] = endtc[1];
2375                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2376                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2377                 substep[0] = (endtc[0] - tc[0]) * subscale;
2378                 substep[1] = (endtc[1] - tc[1]) * subscale;
2379                 subtc[0] = tc[0] * (1<<16);
2380                 subtc[1] = tc[1] * (1<<16);
2381                 if (filter)
2382                 {
2383                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2384                         {
2385                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2386                                 {
2387                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2388                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2389                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2390                                         tci[0] = subtc[0]>>16;
2391                                         tci[1] = subtc[1]>>16;
2392                                         tci1[0] = tci[0] + 1;
2393                                         tci1[1] = tci[1] + 1;
2394                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2395                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2396                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2397                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2398                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2399                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2400                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2401                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2402                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2403                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2404                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2405                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2406                                         out4f[x*4+0] = c[0];
2407                                         out4f[x*4+1] = c[1];
2408                                         out4f[x*4+2] = c[2];
2409                                         out4f[x*4+3] = c[3];
2410                                 }
2411                         }
2412                         else
2413                         {
2414                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415                                 {
2416                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419                                         tci[0] = subtc[0]>>16;
2420                                         tci[1] = subtc[1]>>16;
2421                                         tci1[0] = tci[0] + 1;
2422                                         tci1[1] = tci[1] + 1;
2423                                         tci[0] &= tciwrapmask[0];
2424                                         tci[1] &= tciwrapmask[1];
2425                                         tci1[0] &= tciwrapmask[0];
2426                                         tci1[1] &= tciwrapmask[1];
2427                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435                                         out4f[x*4+0] = c[0];
2436                                         out4f[x*4+1] = c[1];
2437                                         out4f[x*4+2] = c[2];
2438                                         out4f[x*4+3] = c[3];
2439                                 }
2440                         }
2441                 }
2442                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2443                 {
2444                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2445                         {
2446                                 tci[0] = subtc[0]>>16;
2447                                 tci[1] = subtc[1]>>16;
2448                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2449                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2450                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2451                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2452                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2453                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2454                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2455                                 out4f[x*4+0] = c[0];
2456                                 out4f[x*4+1] = c[1];
2457                                 out4f[x*4+2] = c[2];
2458                                 out4f[x*4+3] = c[3];
2459                         }
2460                 }
2461                 else
2462                 {
2463                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2464                         {
2465                                 tci[0] = subtc[0]>>16;
2466                                 tci[1] = subtc[1]>>16;
2467                                 tci[0] &= tciwrapmask[0];
2468                                 tci[1] &= tciwrapmask[1];
2469                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2470                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2471                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2472                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2473                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2474                                 out4f[x*4+0] = c[0];
2475                                 out4f[x*4+1] = c[1];
2476                                 out4f[x*4+2] = c[2];
2477                                 out4f[x*4+3] = c[3];
2478                         }
2479                 }
2480         }
2481 }
2482
2483 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2484 {
2485 #ifdef SSE2_PRESENT
2486         int x;
2487         int startx = span->startx;
2488         int endx = span->endx;
2489         int flags;
2490         __m128 data, slope, tcscale;
2491         __m128i tcsize, tcmask, tcoffset, tcmax;
2492         __m128 tc, endtc;
2493         __m128i subtc, substep, endsubtc;
2494         int filter;
2495         int mip;
2496         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2497         const unsigned char * RESTRICT pixelbase;
2498         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2499         // if no texture is bound, just fill it with white
2500         if (!texture)
2501         {
2502                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2503                 return;
2504         }
2505         mip = triangle->mip[texunitindex];
2506         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2507         // if this mipmap of the texture is 1 pixel, just fill it with that color
2508         if (texture->mipmap[mip][1] == 4)
2509         {
2510                 unsigned int k = *((const unsigned int *)pixelbase);
2511                 for (x = startx;x < endx;x++)
2512                         outi[x] = k;
2513                 return;
2514         }
2515         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2517         flags = texture->flags;
2518         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2519         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2520         tcscale = _mm_cvtepi32_ps(tcsize);
2521         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2522         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2523         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2524         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2525         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2526         tcmax = _mm_packs_epi32(tcmask, tcmask);
2527         for (x = startx;x < endx;)
2528         {
2529                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2530                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2531                 if (nextsub >= endx)
2532                 {
2533                         nextsub = endsub = endx-1;
2534                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2535                 }       
2536                 tc = endtc;
2537                 subtc = endsubtc;
2538                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2539                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2540                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2541                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2542                 substep = _mm_slli_epi32(substep, 1);
2543                 if (filter)
2544                 {
2545                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2546                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2547                         {
2548                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2549                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2550                                 {
2551                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2552                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2553                                         tci = _mm_madd_epi16(tci, tcoffset);
2554                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2555                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2556                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2557                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2558                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2559                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2560                                         fracm = _mm_srli_epi16(subtc, 1);
2561                                         pix1 = _mm_add_epi16(pix1,
2562                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2563                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2564                                         pix3 = _mm_add_epi16(pix3,
2565                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2566                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2567                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2568                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2569                                         pix2 = _mm_add_epi16(pix2,
2570                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2571                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2572                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2573                                 }
2574                                 if (x <= endsub)
2575                                 {
2576                                         const unsigned char * RESTRICT ptr1;
2577                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2578                                         tci = _mm_madd_epi16(tci, tcoffset);
2579                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2581                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2582                                         fracm = _mm_srli_epi16(subtc, 1);
2583                                         pix1 = _mm_add_epi16(pix1,
2584                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2585                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2586                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2587                                         pix1 = _mm_add_epi16(pix1,
2588                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2590                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2591                                         x++;
2592                                 }
2593                         }
2594                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2595                         {
2596                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2597                                 {
2598                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2599                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2600                                         tci = _mm_madd_epi16(tci, tcoffset);
2601                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603                                                                                         _mm_setzero_si128());
2604                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606                                                                                         _mm_setzero_si128());
2607                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2608                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2609                                         tci = _mm_madd_epi16(tci, tcoffset);
2610                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612                                                                                         _mm_setzero_si128());
2613                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615                                                                                         _mm_setzero_si128());
2616                                         fracm = _mm_srli_epi16(subtc, 1);
2617                                         pix1 = _mm_add_epi16(pix1,
2618                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2619                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2620                                         pix3 = _mm_add_epi16(pix3,
2621                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2622                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2623                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2624                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2625                                         pix2 = _mm_add_epi16(pix2,
2626                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2627                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2628                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2629                                 }
2630                                 if (x <= endsub)
2631                                 {
2632                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2633                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2634                                         tci = _mm_madd_epi16(tci, tcoffset);
2635                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2636                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2637                                                                                         _mm_setzero_si128());
2638                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2639                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2640                                                                                         _mm_setzero_si128());
2641                                         fracm = _mm_srli_epi16(subtc, 1);
2642                                         pix1 = _mm_add_epi16(pix1,
2643                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2646                                         pix1 = _mm_add_epi16(pix1,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2649                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2650                                         x++;
2651                                 }
2652                         }
2653                         else
2654                         {
2655                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2656                                 {
2657                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2658                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662                                                                                         _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665                                                                                         _mm_setzero_si128());
2666                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2667                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668                                         tci = _mm_madd_epi16(tci, tcoffset);
2669                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671                                                                                         _mm_setzero_si128());
2672                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674                                                                                         _mm_setzero_si128());
2675                                         fracm = _mm_srli_epi16(subtc, 1);
2676                                         pix1 = _mm_add_epi16(pix1,
2677                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2679                                         pix3 = _mm_add_epi16(pix3,
2680                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2681                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2682                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2683                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2684                                         pix2 = _mm_add_epi16(pix2,
2685                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2686                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2687                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2688                                 }
2689                                 if (x <= endsub)
2690                                 {
2691                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2692                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693                                         tci = _mm_madd_epi16(tci, tcoffset);
2694                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2695                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696                                                                                         _mm_setzero_si128());
2697                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699                                                                                         _mm_setzero_si128());
2700                                         fracm = _mm_srli_epi16(subtc, 1);
2701                                         pix1 = _mm_add_epi16(pix1,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2705                                         pix1 = _mm_add_epi16(pix1,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2708                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2709                                         x++;
2710                                 }
2711                         }
2712                 }
2713                 else
2714                 {
2715                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2716                         {
2717                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2718                                 {
2719                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2720                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2721                                         tci = _mm_madd_epi16(tci, tcoffset);
2722                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2724                                 }
2725                                 if (x <= endsub)
2726                                 {
2727                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2728                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2729                                         tci = _mm_madd_epi16(tci, tcoffset);
2730                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2731                                         x++;
2732                                 }
2733                         }
2734                         else
2735                         {
2736                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737                                 {
2738                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2739                                         tci = _mm_and_si128(tci, tcmax); 
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2743                                 }
2744                                 if (x <= endsub)
2745                                 {
2746                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2747                                         tci = _mm_and_si128(tci, tcmax); 
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2750                                         x++;
2751                                 }
2752                         }
2753                 }
2754         }
2755 #endif
2756 }
2757
2758 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2759 {
2760         // TODO: IMPLEMENT
2761         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2762 }
2763
// Shadowmap lookup stub: no sampling is implemented, so the lookup vector is
// ignored and the constant 1.0f is returned for every input.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2769
2770 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2771 {
2772         int x;
2773         int startx = span->startx;
2774         int endx = span->endx;
2775         float c[4];
2776         float data[4];
2777         float slope[4];
2778         float z;
2779         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2780         for (x = startx;x < endx;x++)
2781         {
2782                 z = zf[x];
2783                 c[0] = (data[0] + slope[0]*x) * z;
2784                 c[1] = (data[1] + slope[1]*x) * z;
2785                 c[2] = (data[2] + slope[2]*x) * z;
2786                 c[3] = (data[3] + slope[3]*x) * z;
2787                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2788                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2789                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2790                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2791         }
2792 }
2793
2794 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2795 {
2796         int x;
2797         int startx = span->startx;
2798         int endx = span->endx;
2799         float c[4];
2800         float data[4];
2801         float slope[4];
2802         float z;
2803         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2804         for (x = startx;x < endx;x++)
2805         {
2806                 z = zf[x];
2807                 c[0] = (data[0] + slope[0]*x) * z;
2808                 c[1] = (data[1] + slope[1]*x) * z;
2809                 c[2] = (data[2] + slope[2]*x) * z;
2810                 c[3] = (data[3] + slope[3]*x) * z;
2811                 out4f[x*4+0] = c[0];
2812                 out4f[x*4+1] = c[1];
2813                 out4f[x*4+2] = c[2];
2814                 out4f[x*4+3] = c[3];
2815         }
2816 }
2817
2818 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2819 {
2820         int x, startx = span->startx, endx = span->endx;
2821         float c[4], localcolor[4];
2822         localcolor[0] = subcolor[0];
2823         localcolor[1] = subcolor[1];
2824         localcolor[2] = subcolor[2];
2825         localcolor[3] = subcolor[3];
2826         for (x = startx;x < endx;x++)
2827         {
2828                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2829                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2830                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2831                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2832                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2833                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2834                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2835                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2836         }
2837 }
2838
2839 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2840 {
2841         int x, startx = span->startx, endx = span->endx;
2842         for (x = startx;x < endx;x++)
2843         {
2844                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2845                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2846                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2847                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2848         }
2849 }
2850
2851 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2852 {
2853         int x, startx = span->startx, endx = span->endx;
2854         for (x = startx;x < endx;x++)
2855         {
2856                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2857                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2858                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2859                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2860         }
2861 }
2862
2863 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2864 {
2865         int x, startx = span->startx, endx = span->endx;
2866         float a, b;
2867         for (x = startx;x < endx;x++)
2868         {
2869                 a = 1.0f - inb4f[x*4+3];
2870                 b = inb4f[x*4+3];
2871                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2872                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2873                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2874                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2875         }
2876 }
2877
2878 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2879 {
2880         int x, startx = span->startx, endx = span->endx;
2881         float localcolor[4], ilerp, lerp;
2882         localcolor[0] = color[0];
2883         localcolor[1] = color[1];
2884         localcolor[2] = color[2];
2885         localcolor[3] = color[3];
2886         ilerp = 1.0f - localcolor[3];
2887         lerp = localcolor[3];
2888         for (x = startx;x < endx;x++)
2889         {
2890                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2891                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2892                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2893                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2894         }
2895 }
2896
2897
2898
2899 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2900 {
2901 #ifdef SSE2_PRESENT
2902         int x;
2903         int startx = span->startx;
2904         int endx = span->endx;
2905         __m128 data, slope;
2906         __m128 mod, endmod;
2907         __m128i submod, substep, endsubmod;
2908         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2909         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2911         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2913         for (x = startx; x < endx;)
2914         {
2915                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2917                 if (nextsub >= endx)
2918                 {
2919                         nextsub = endsub = endx-1;
2920                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2921                 }
2922                 mod = endmod;
2923                 submod = endsubmod;
2924                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2927                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928                 substep = _mm_packs_epi32(substep, substep);
2929                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2930                 {
2931                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2932                         pix = _mm_mulhi_epu16(pix, submod);
2933                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2934                 }
2935                 if (x <= endsub)
2936                 {
2937                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2938                         pix = _mm_mulhi_epu16(pix, submod);
2939                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2940                         x++;
2941                 }
2942         }
2943 #endif
2944 }
2945
2946 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2947 {
2948 #ifdef SSE2_PRESENT
2949         int x;
2950         int startx = span->startx;
2951         int endx = span->endx;
2952         __m128 data, slope;
2953         __m128 mod, endmod;
2954         __m128i submod, substep, endsubmod;
2955         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2956         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2957         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2958         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2959         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2960         for (x = startx; x < endx;)
2961         {
2962                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2963                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2964                 if (nextsub >= endx)
2965                 {
2966                         nextsub = endsub = endx-1;
2967                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2968                 }
2969                 mod = endmod;
2970                 submod = endsubmod;
2971                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2972                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2973                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2974                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2975                 substep = _mm_packs_epi32(substep, substep);
2976                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2977                 {
2978                         __m128i pix = _mm_srai_epi16(submod, 4);
2979                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2980                 }
2981                 if (x <= endsub)
2982                 {
2983                         __m128i pix = _mm_srai_epi16(submod, 4);
2984                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2985                         x++;
2986                 }
2987         }
2988 #endif
2989 }
2990
2991 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2992 {
2993 #ifdef SSE2_PRESENT
2994         int x, startx = span->startx, endx = span->endx;
2995         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2996         localcolor = _mm_packs_epi32(localcolor, localcolor);
2997         for (x = startx;x+2 <= endx;x+=2)
2998         {
2999                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3000                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3001                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3002                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3003         }
3004         if (x < endx)
3005         {
3006                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3007                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3008                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3009                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3010         }
3011 #endif
3012 }
3013
3014 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3015 {
3016 #ifdef SSE2_PRESENT
3017         int x, startx = span->startx, endx = span->endx;
3018         for (x = startx;x+2 <= endx;x+=2)
3019         {
3020                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3024         }
3025         if (x < endx)
3026         {
3027                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3030                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3031         }
3032 #endif
3033 }
3034
3035 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3036 {
3037 #ifdef SSE2_PRESENT
3038         int x, startx = span->startx, endx = span->endx;
3039         for (x = startx;x+2 <= endx;x+=2)
3040         {
3041                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043                 pix1 = _mm_add_epi16(pix1, pix2);
3044                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3045         }
3046         if (x < endx)
3047         {
3048                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050                 pix1 = _mm_add_epi16(pix1, pix2);
3051                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3052         }
3053 #endif
3054 }
3055
3056 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3057 {
3058 #ifdef SSE2_PRESENT
3059         int x, startx = span->startx, endx = span->endx;
3060         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3061         tint = _mm_packs_epi32(tint, tint);
3062         for (x = startx;x+2 <= endx;x+=2)
3063         {
3064                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3065                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3066                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3068         }
3069         if (x < endx)
3070         {
3071                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3072                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3073                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3074                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3075         }
3076 #endif
3077 }
3078
3079 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3080 {
3081 #ifdef SSE2_PRESENT
3082         int x, startx = span->startx, endx = span->endx;
3083         for (x = startx;x+2 <= endx;x+=2)
3084         {
3085                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3087                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3088                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3090         }
3091         if (x < endx)
3092         {
3093                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3094                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3095                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3096                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3097                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3098         }
3099 #endif
3100 }
3101
3102 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3103 {
3104 #ifdef SSE2_PRESENT
3105         int x, startx = span->startx, endx = span->endx;
3106         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3107         localcolor = _mm_packs_epi32(localcolor, localcolor);
3108         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3109         for (x = startx;x+2 <= endx;x+=2)
3110         {
3111                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3112                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3113                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3114         }
3115         if (x < endx)
3116         {
3117                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3118                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3119                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3120         }
3121 #endif
3122 }
3123
3124
3125
3126 void DPSOFTRAST_VertexShader_Generic(void)
3127 {
3128         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3130         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3131         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3132                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3133 }
3134
3135 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3136 {
3137         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3138         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3143         {
3144                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3145                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3146                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3147                 {
3148                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3149                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3150                         {
3151                                 // multiply
3152                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3153                         }
3154                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3155                         {
3156                                 // add
3157                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3158                         }
3159                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3160                         {
3161                                 // alphablend
3162                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3163                         }
3164                 }
3165         }
3166         else
3167                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3168         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3169 }
3170
3171
3172
3173 void DPSOFTRAST_VertexShader_PostProcess(void)
3174 {
3175         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3176         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3177         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3178 }
3179
3180 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3181 {
3182         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3183         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3184         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3186         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3187         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3188         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3189         {
3190                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3191                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3192         }
3193         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3194         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3195         {
3196                 // TODO: implement saturation
3197         }
3198         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3199         {
3200                 // TODO: implement gammaramps
3201         }
3202         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3203 }
3204
3205
3206
3207 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3208 {
3209         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3210 }
3211
3212 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3213 {
3214         // this is never called (because colormask is off when this shader is used)
3215         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3216         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3217         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3218         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3219         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3220 }
3221
3222
3223
3224 void DPSOFTRAST_VertexShader_FlatColor(void)
3225 {
3226         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3228 }
3229
3230 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3231 {
3232 #ifdef SSE2_PRESENT
3233         unsigned char * RESTRICT pixelmask = span->pixelmask;
3234         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3235         int x, startx = span->startx, endx = span->endx;
3236         __m128i Color_Ambientm;
3237         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3238         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3240         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3241         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3242         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3243                 pixel = buffer_FragColorbgra8;
3244         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3245         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3246         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3247         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3248         for (x = startx;x < endx;x++)
3249         {
3250                 __m128i color, pix;
3251                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3252                 {
3253                         __m128i pix2;
3254                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3255                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3256                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3257                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3258                         x += 3;
3259                         continue;
3260                 }
3261                 if (!pixelmask[x])
3262                         continue;
3263                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3264                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3265                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3266         }
3267         if (pixel == buffer_FragColorbgra8)
3268                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3269 #endif
3270 }
3271
3272
3273
3274 void DPSOFTRAST_VertexShader_VertexColor(void)
3275 {
3276         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3278         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3279 }
3280
3281 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3282 {
3283 #ifdef SSE2_PRESENT
3284         unsigned char * RESTRICT pixelmask = span->pixelmask;
3285         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3286         int x, startx = span->startx, endx = span->endx;
3287         __m128i Color_Ambientm, Color_Diffusem;
3288         __m128 data, slope;
3289         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3290         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3292         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3293         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3294         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3295         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3296                 pixel = buffer_FragColorbgra8;
3297         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3298         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3299         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3300         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3301         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3302         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3303         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3304         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3305         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3306         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3307         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3308         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3309         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3310         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3311         {
3312                 __m128i color, mod, pix;
3313                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3314                 {
3315                         __m128i pix2, mod2;
3316                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3317                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3318                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3319                         data = _mm_add_ps(data, slope);
3320                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3321                         data = _mm_add_ps(data, slope);
3322                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3323                         data = _mm_add_ps(data, slope);
3324                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3325                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3326                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3327                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3328                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3329                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3330                         x += 3;
3331                         continue;
3332                 }
3333                 if (!pixelmask[x])
3334                         continue;
3335                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3336                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3337                 mod = _mm_packs_epi32(mod, mod);
3338                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3339                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3340         }
3341         if (pixel == buffer_FragColorbgra8)
3342                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3343 #endif
3344 }
3345
3346
3347
3348 void DPSOFTRAST_VertexShader_Lightmap(void)
3349 {
3350         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3351         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3352         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3353 }
3354
3355 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3356 {
3357 #ifdef SSE2_PRESENT
3358         unsigned char * RESTRICT pixelmask = span->pixelmask;
3359         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3360         int x, startx = span->startx, endx = span->endx;
3361         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3362         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3363         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3368         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3369         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3370         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3371                 pixel = buffer_FragColorbgra8;
3372         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3375         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3376         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3377         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3378         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3379         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3380         {
3381                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3382                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3383                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3384                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3385                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3386                 for (x = startx;x < endx;x++)
3387                 {
3388                         __m128i color, lightmap, glow, pix;
3389                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3390                         {
3391                                 __m128i pix2;
3392                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3393                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3394                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3395                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3396                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3397                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3398                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3399                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3400                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3401                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3402                                 x += 3;
3403                                 continue;
3404                         }
3405                         if (!pixelmask[x])
3406                                 continue;
3407                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3410                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3411                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3412                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3413                 }
3414         }
3415         else
3416         {
3417                 for (x = startx;x < endx;x++)
3418                 {
3419                         __m128i color, lightmap, pix;
3420                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3421                         {
3422                                 __m128i pix2;
3423                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3424                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3425                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3426                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3427                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3429                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3430                                 x += 3;
3431                                 continue;
3432                         }
3433                         if (!pixelmask[x]) 
3434                                 continue;
3435                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3436                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3437                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3438                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3439                 }
3440         }
3441         if (pixel == buffer_FragColorbgra8)
3442                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3443 #endif
3444 }
3445
3446
3447
3448 void DPSOFTRAST_VertexShader_FakeLight(void)
3449 {
3450         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3451 }
3452
3453 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3454 {
3455         // TODO: IMPLEMENT
3456         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3457         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3459         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3460         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3461 }
3462
3463
3464
3465 void DPSOFTRAST_VertexShader_LightDirection(void);
3466 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3467 {
3468         DPSOFTRAST_VertexShader_LightDirection();
3469         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3470 }
3471
3472 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3474 {
3475         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3476 }
3477
3478
3479
// Vertex shader for the tangent-space light direction map mode.
// There is no dedicated implementation; the Lightmap vertex shader is
// used as-is.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3484
3485 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3486 {
3487         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3488         // TODO: IMPLEMENT
3489 }
3490
3491
3492
3493 void DPSOFTRAST_VertexShader_LightDirection(void)
3494 {
3495         int i;
3496         int numvertices = dpsoftrast.numvertices;
3497         float LightDir[4];
3498         float LightVector[4];
3499         float EyePosition[4];
3500         float EyeVectorModelSpace[4];
3501         float EyeVector[4];
3502         float position[4];
3503         float svector[4];
3504         float tvector[4];
3505         float normal[4];
3506         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3507         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3508         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3509         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3510         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3511         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3512         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3513         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3514         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3515         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3516         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3517         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3518         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3519         for (i = 0;i < numvertices;i++)
3520         {
3521                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3522                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3523                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3524                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3525                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3526                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3527                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3528                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3529                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3530                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3531                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3532                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3533                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3534                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3535                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3538                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3539                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3540                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3541                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3542                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3543                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3544                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3545                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3547                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3548                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3549                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3550         }
3551         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3552 }
3553
3554 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3555 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3556 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3557 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3558 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3559 #define DPSOFTRAST_Vector3Normalize(v)\
3560 do\
3561 {\
3562         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3563         if (len)\
3564         {\
3565                 len = 1.0f / len;\
3566                 v[0] *= len;\
3567                 v[1] *= len;\
3568                 v[2] *= len;\
3569         }\
3570 }\
3571 while(0)
3572
3573 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3574 {
3575         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3576         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3582         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3583         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3584         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585         int x, startx = span->startx, endx = span->endx;
3586         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3587         float LightVectordata[4];
3588         float LightVectorslope[4];
3589         float EyeVectordata[4];
3590         float EyeVectorslope[4];
3591         float VectorSdata[4];
3592         float VectorSslope[4];
3593         float VectorTdata[4];
3594         float VectorTslope[4];
3595         float VectorRdata[4];
3596         float VectorRslope[4];
3597         float z;
3598         float diffusetex[4];
3599         float glosstex[4];
3600         float surfacenormal[4];
3601         float lightnormal[4];
3602         float lightnormal_modelspace[4];
3603         float eyenormal[4];
3604         float specularnormal[4];
3605         float diffuse;
3606         float specular;
3607         float SpecularPower;
3608         int d[4];
3609         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3610         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3611         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3612         Color_Glow[3] = 0.0f;
3613         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3614         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3615         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3616         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3617         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3618         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3619         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3620         Color_Pants[3] = 0.0f;
3621         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3622         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3623         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3624         Color_Shirt[3] = 0.0f;
3625         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3626         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3628         {
3629                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3631         }
3632         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3633         {
3634                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3635         }
3636         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3637         {
3638                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3639                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3640                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3641                 Color_Diffuse[3] = 0.0f;
3642                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3643                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3644                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3645                 LightColor[3] = 0.0f;
3646                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3647                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3648                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3649                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3650                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3651                 Color_Specular[3] = 0.0f;
3652                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3653                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3654                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3655
3656                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3657                 {
3658                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3659                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3660                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3661                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3662                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3663                 }
3664
3665                 for (x = startx;x < endx;x++)
3666                 {
3667                         z = buffer_z[x];
3668                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3669                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3670                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3671                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3672                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3673                         {
3674                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3675                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3676                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3677                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3678                         }
3679                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3680                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3681                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3682                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3683                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3684                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3685                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3686                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3687
3688                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3689                         {
3690                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3691                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3692                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3693                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3694
3695                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3696                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3697                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3698                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3699
3700                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3701                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3702                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3703                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3704
3705                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3706                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3707                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3708                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3709
3710                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3711                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3712
3713                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3714                                 {
3715                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3716                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3717                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3718                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3719                                 }
3720                         }
3721                         else
3722                         {
3723                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3724                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3725                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3726                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3727                         }
3728
3729                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3730                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3731                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3732                         DPSOFTRAST_Vector3Normalize(eyenormal);
3733
3734                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3735                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3736                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3737                         DPSOFTRAST_Vector3Normalize(specularnormal);
3738
3739                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3740                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3741                         specular = pow(specular, SpecularPower * glosstex[3]);
3742                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3743                         {
3744                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3745                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3746                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3747                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3748                         }
3749                         else
3750                         {
3751                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3752                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3753                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3754                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3755                         }
3756
3757                         buffer_FragColorbgra8[x*4+0] = d[0];
3758                         buffer_FragColorbgra8[x*4+1] = d[1];
3759                         buffer_FragColorbgra8[x*4+2] = d[2];
3760                         buffer_FragColorbgra8[x*4+3] = d[3];
3761                 }
3762         }
3763         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3764         {
3765                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3766                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3767                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3768                 Color_Diffuse[3] = 0.0f;
3769                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3770                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3771                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3772                 LightColor[3] = 0.0f;
3773                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3774                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3775
3776                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3777                 {
3778                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3779                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3780                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3781                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3782                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3783                 }
3784
3785                 for (x = startx;x < endx;x++)
3786                 {
3787                         z = buffer_z[x];
3788                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3789                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3790                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3791                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3792                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3793                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3794                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3795                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3796
3797                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3798                         {
3799                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3800                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3801                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3802                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3803
3804                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3805                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3806                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3807                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3808
3809                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3810                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3811                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3812                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3813
3814                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3815                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3816                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3817                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3818
3819                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3820                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3821
3822                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3823                                 {
3824                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3825                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3826                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3827                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3828                                 }
3829                         }
3830                         else
3831                         {
3832                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3833                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3834                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3835                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3836                         }
3837
3838                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3839                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3840                         {
3841                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3842                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3843                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3844                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3845                         }
3846                         else
3847                         {
3848                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3849                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3850                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3851                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3852                         }
3853                         buffer_FragColorbgra8[x*4+0] = d[0];
3854                         buffer_FragColorbgra8[x*4+1] = d[1];
3855                         buffer_FragColorbgra8[x*4+2] = d[2];
3856                         buffer_FragColorbgra8[x*4+3] = d[3];
3857                 }
3858         }
3859         else
3860         {
3861                 for (x = startx;x < endx;x++)
3862                 {
3863                         z = buffer_z[x];
3864                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3865                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3866                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3867                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3868
3869                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3870                         {
3871                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3872                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3873                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3874                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3875                         }
3876                         else
3877                         {
3878                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3879                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3880                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3881                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3882                         }
3883                         buffer_FragColorbgra8[x*4+0] = d[0];
3884                         buffer_FragColorbgra8[x*4+1] = d[1];
3885                         buffer_FragColorbgra8[x*4+2] = d[2];
3886                         buffer_FragColorbgra8[x*4+3] = d[3];
3887                 }
3888         }
3889         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3890 }
3891
3892
3893
// Vertex shader for the LightSource mode.
//
// For every vertex this replaces TEXCOORD1 with the vector from the vertex to
// the light and TEXCOORD2 with the vector from the vertex to the eye, each
// expressed as three dot products against the incoming TEXCOORD1, TEXCOORD2
// and TEXCOORD3 vectors (called svector, tvector and normal below).
// Afterwards POSITION is run through the model-view-projection matrix and
// TEXCOORD3 is regenerated from POSITION with the ModelToLight matrix
// (DPSOFTRAST_PixelShader_LightSource reads TEXCOORD3 as its CubeVector).
//
// Takes no parameters and returns nothing; it works on the global dpsoftrast
// state (uniform4f, post_array4f, numvertices).
3894 void DPSOFTRAST_VertexShader_LightSource(void)
3895 {
3896         int i;
3897         int numvertices = dpsoftrast.numvertices;
3898         float LightPosition[4];
3899         float LightVector[4];
3900         float LightVectorModelSpace[4];
3901         float EyePosition[4];
3902         float EyeVectorModelSpace[4];
3903         float EyeVector[4];
3904         float position[4];
3905         float svector[4];
3906         float tvector[4];
3907         float normal[4];
             // fetch the light and eye position uniforms
             // NOTE(review): component [3] of both is loaded but never read below
3908         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3909         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3910         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3911         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3912         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3913         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3914         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3915         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
             // prepare the arrays the loop reads from post_array4f; TEXCOORD0 goes through the texture matrix
             // (DPSOFTRAST_Array_Load/Transform are defined elsewhere - presumably they fill post_array4f from the input arrays, TODO confirm)
             // NOTE(review): TEXCOORD4 is loaded here but not referenced again in this function
3916         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3917         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3918         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3919         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3920         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3921         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3922         for (i = 0;i < numvertices;i++)
3923         {
                     // read position and the three basis vectors first: TEXCOORD1 and TEXCOORD2 are overwritten further down
3924                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3925                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3926                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3927                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3928                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3929                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3930                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3931                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3932                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3933                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3934                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3935                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
                     // vector from the vertex to the light, then projected onto svector/tvector/normal
3936                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3937                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3938                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3939                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3940                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3941                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
                     // TEXCOORD1 now carries the projected light vector (w forced to 0)
3942                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3943                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3944                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3945                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
                     // same treatment for the vector from the vertex to the eye
3946                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3947                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3948                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3949                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3950                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3951                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
                     // TEXCOORD2 now carries the projected eye vector (w forced to 0)
3952                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3953                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3954                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3955                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3956         }
             // project POSITION with the model-view-projection matrix
             // NOTE(review): the meaning of the -1 source index is defined by DPSOFTRAST_Array_TransformProject, which is not visible here - confirm
3957         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
             // regenerate TEXCOORD3 from POSITION using the ModelToLight matrix, replacing the normal read in the loop above
3958         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3959 }
3960
// Pixel shader for the LightSource mode: shades one span of a triangle and
// hands the resulting BGRA8 colors to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread state; supplies uniform4f and shader_permutation
// triangle - triangle being drawn (passed on to the attribute/texture helpers)
// span     - span being drawn; only pixels in [span->startx, span->endx) are touched
//
// Depending on shader_permutation one of three paths runs: SPECULAR (diffuse
// plus specular term), DIFFUSE (diffuse term only), or neither (ambient only).
// Each path can additionally apply COLORMAPPING (pants/shirt layers),
// CUBEFILTER (cube texture modulation) and SHADOWMAP2D (shadow attenuation).
//
// The whole body is inside #ifdef SSE2_PRESENT, so without that define this
// function does nothing.
3961 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3962 {
3963 #ifdef SSE2_PRESENT
3964         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3965         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3966         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3967         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3968         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3969         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3970         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3971         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3972         int x, startx = span->startx, endx = span->endx;
3973         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3974         float CubeVectordata[4];
3975         float CubeVectorslope[4];
3976         float LightVectordata[4];
3977         float LightVectorslope[4];
3978         float EyeVectordata[4];
3979         float EyeVectorslope[4];
3980         float z;
3981         float diffusetex[4];
3982         float glosstex[4];
3983         float surfacenormal[4];
3984         float lightnormal[4];
3985         float eyenormal[4];
3986         float specularnormal[4];
3987         float diffuse;
3988         float specular;
3989         float SpecularPower;
3990         float CubeVector[4];
3991         float attenuation;
3992         int d[4];
             // color uniforms are copied with components 0 and 2 swapped (uniform [0] -> local [2]),
             // presumably so the locals line up with the byte order of the *bgra8 buffers - TODO confirm
             // NOTE(review): Color_Glow is loaded here but never read again in this function
3993         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3994         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3995         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3996         Color_Glow[3] = 0.0f;
3997         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3998         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3999         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
             // NOTE(review): Color_Ambient[3] takes the Alpha uniform but is not read again in this function (output alpha is diffusetex[3])
4000         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4001         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4002         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4003         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4004         Color_Diffuse[3] = 0.0f;
4005         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4006         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4007         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4008         Color_Specular[3] = 0.0f;
4009         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4010         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4011         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4012         Color_Pants[3] = 0.0f;
4013         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4014         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4015         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4016         Color_Shirt[3] = 0.0f;
4017         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4018         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4019         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4020         LightColor[3] = 0.0f;
             // pre-divided by 255 because it is later multiplied by glosstex[3], which is a raw byte value from the gloss texture
4021         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
             // per-span data/slope pairs for the interpolated attributes; each is evaluated below as (data + slope*x) * z
4022         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4023         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4024         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4025         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4026         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
             // sample the textures needed by every path up front; the optional layers are only sampled when their permutation bit is set
4027         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4028         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4029         {
4030                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4031                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4032         }
4033         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4034                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
             // path 1: diffuse + specular lighting
4035         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4036         {
4037                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4038                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4039                 for (x = startx;x < endx;x++)
4040                 {
4041                         z = buffer_z[x];
                             // attenuation falls off with the squared length of CubeVector; pixels below the 0.01 threshold are skipped and keep the zero written by the memset above
4042                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4043                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4044                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4045                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4046                         if (attenuation < 0.01f)
4047                                 continue;
4048                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4049                         {
4050                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4051                                 if (attenuation < 0.01f)
4052                                         continue;
4053                         }
4054
4055                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4056                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4057                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4058                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4059                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4060                         {
                                     // add the tinted pants and shirt layers; Color_Pants[3] and Color_Shirt[3] are 0, so alpha is unchanged
4061                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4062                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4063                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4064                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4065                         }
4066                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4067                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4068                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4069                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
                             // decode the normal map texel: byte * (1/128) - 1, with byte index 2 feeding component 0 and byte index 0 feeding component 2
4070                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4071                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4072                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4073                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4074
4075                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4076                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4077                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4078                         DPSOFTRAST_Vector3Normalize(lightnormal);
4079
4080                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4081                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4082                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4083                         DPSOFTRAST_Vector3Normalize(eyenormal);
4084
                             // the specular direction is the normalized sum of the light and eye directions
4085                         specularnormal[0] = lightnormal[0] + eyenormal[0];
4086                         specularnormal[1] = lightnormal[1] + eyenormal[1];
4087                         specularnormal[2] = lightnormal[2] + eyenormal[2];
4088                         DPSOFTRAST_Vector3Normalize(specularnormal);
4089
                             // both dot products are clamped to >= 0 before use; the specular one is then raised to SpecularPower * glosstex[3]
4090                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4091                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4092                         specular = pow(specular, SpecularPower * glosstex[3]);
                             // results are only clamped against the upper bound 255 before being stored as bytes; alpha is passed through from diffusetex[3]
4093                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4094                         {
4095                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4096                                 attenuation *= (1.0f / 255.0f);
4097                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4098                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4099                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4100                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4101                         }
4102                         else
4103                         {
4104                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4105                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4106                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4107                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4108                         }
4109                         buffer_FragColorbgra8[x*4+0] = d[0];
4110                         buffer_FragColorbgra8[x*4+1] = d[1];
4111                         buffer_FragColorbgra8[x*4+2] = d[2];
4112                         buffer_FragColorbgra8[x*4+3] = d[3];
4113                 }
4114         }
             // path 2: diffuse lighting only (same as path 1 without the gloss texture, eye vector and specular term)
4115         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4116         {
4117                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4118                 for (x = startx;x < endx;x++)
4119                 {
4120                         z = buffer_z[x];
4121                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4122                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4123                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4124                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4125                         if (attenuation < 0.01f)
4126                                 continue;
4127                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4128                         {
4129                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4130                                 if (attenuation < 0.01f)
4131                                         continue;
4132                         }
4133
4134                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4135                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4136                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4137                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4138                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4139                         {
4140                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4141                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4142                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4143                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4144                         }
4145                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4146                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4147                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4148                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4149
4150                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4151                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4152                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4153                         DPSOFTRAST_Vector3Normalize(lightnormal);
4154
4155                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4156                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4157                         {
4158                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4159                                 attenuation *= (1.0f / 255.0f);
4160                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4161                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4162                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4163                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4164                         }
4165                         else
4166                         {
4167                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4168                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4169                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4170                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4171                         }
4172                         buffer_FragColorbgra8[x*4+0] = d[0];
4173                         buffer_FragColorbgra8[x*4+1] = d[1];
4174                         buffer_FragColorbgra8[x*4+2] = d[2];
4175                         buffer_FragColorbgra8[x*4+3] = d[3];
4176                 }
4177         }
             // path 3: ambient only (no normal map lookup, no diffuse or specular term)
4178         else
4179         {
4180                 for (x = startx;x < endx;x++)
4181                 {
4182                         z = buffer_z[x];
4183                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4184                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4185                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4186                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4187                         if (attenuation < 0.01f)
4188                                 continue;
4189                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4190                         {
4191                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4192                                 if (attenuation < 0.01f)
4193                                         continue;
4194                         }
4195
4196                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4197                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4198                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4199                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4200                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4201                         {
4202                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4203                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4204                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4205                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4206                         }
4207                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4208                         {
4209                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4210                                 attenuation *= (1.0f / 255.0f);
4211                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4212                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4213                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4214                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4215                         }
4216                         else
4217                         {
4218                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4219                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4220                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4221                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4222                         }
4223                         buffer_FragColorbgra8[x*4+0] = d[0];
4224                         buffer_FragColorbgra8[x*4+1] = d[1];
4225                         buffer_FragColorbgra8[x*4+2] = d[2];
4226                         buffer_FragColorbgra8[x*4+3] = d[3];
4227                 }
4228         }
             // hand the finished span colors over for writing out
4229         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4230 #endif
4231 }
4232
4233
4234
4235 void DPSOFTRAST_VertexShader_Refraction(void)
4236 {
4237         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4238 }
4239
4240 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4241 {
4242         // TODO: IMPLEMENT
4243         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4244         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4245         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4246         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4247         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4248 }
4249
4250
4251
4252 void DPSOFTRAST_VertexShader_Water(void)
4253 {
4254         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4255 }
4256
4257
4258 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4259 {
4260         // TODO: IMPLEMENT
4261         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4262         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4263         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4264         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4265         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4266 }
4267
4268
4269
4270 void DPSOFTRAST_VertexShader_ShowDepth(void)
4271 {
4272         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4273 }
4274
4275 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4276 {
4277         // TODO: IMPLEMENT
4278         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4279         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4280         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4281         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4282         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4283 }
4284
4285
4286
4287 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4288 {
4289         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4290 }
4291
4292 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4293 {
4294         // TODO: IMPLEMENT
4295         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4296         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4297         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4298         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4299         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4300 }
4301
4302
4303
4304 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4305 {
4306         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4307 }
4308
4309 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4310 {
4311         // TODO: IMPLEMENT
4312         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4313         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4314         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4315         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4316         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4317 }
4318
4319
4320
// Describes one shader mode of the software rasterizer: which functions run for it
// and which vertex arrays and texture units it consumes.
// One entry per mode lives in DPSOFTRAST_ShaderModeTable, indexed by thread->shader_mode.
4321 typedef struct DPSOFTRAST_ShaderModeInfo_s
4322 {
             // array index whose attributes are used to compute the mip edge scale
             // (compared against each entry of arrays[] while setting up a triangle)
4323         int lodarrayindex;
             // vertex function for this mode (takes no arguments)
4324         void (*Vertex)(void);
             // per-span pixel function, called from DPSOFTRAST_Draw_ProcessSpans
4325         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
             // list of DPSOFTRAST_ARRAY_* indices to interpolate; readers stop at the first
             // value >= DPSOFTRAST_ARRAY_TOTAL, so a list shorter than the array ends with ~0
4326         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
             // list of texture units sampled by the mode; readers stop at the first
             // value >= DPSOFTRAST_MAXTEXTUREUNITS, so a list shorter than the array ends with ~0
4327         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4328 }
4329 DPSOFTRAST_ShaderModeInfo;
4330
4331 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4332 {
4333         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4334         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4335         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4336         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4337         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4338         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4339         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4340         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4341         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4342         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4343         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4344         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4345         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4346         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4347         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4348         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4349 };
4350
4351 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4352 {
4353         int i;
4354         int x;
4355         int startx;
4356         int endx;
4357 //      unsigned int c;
4358 //      unsigned int *colorpixel;
4359         unsigned int *depthpixel;
4360         float w;
4361         float wslope;
4362         int depth;
4363         int depthslope;
4364         unsigned int d;
4365         DPSOFTRAST_State_Triangle *triangle;
4366         DPSOFTRAST_State_Span *span;
4367         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4368         for (i = 0; i < thread->numspans; i++)
4369         {
4370                 span = &thread->spans[i];
4371                 triangle = &thread->triangles[span->triangle];
4372                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4373                 {
4374                         wslope = triangle->w[0];
4375                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4376                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4377                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4378                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4379                         startx = span->startx;
4380                         endx = span->endx;
4381                         switch(thread->fb_depthfunc)
4382                         {
4383                         default:
4384                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4385                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4386                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4387                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4388                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4389                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4390                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4391                         }
4392                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4393                         //for (x = startx;x < endx;x++)
4394                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4395                         // if there is no color buffer, skip pixel shader
4396                         while (startx < endx && !pixelmask[startx])
4397                                 startx++;
4398                         while (endx > startx && !pixelmask[endx-1])
4399                                 endx--;
4400                         if (startx >= endx)
4401                                 continue; // no pixels to fill
4402                         span->pixelmask = pixelmask;
4403                         span->startx = startx;
4404                         span->endx = endx;
4405                         // run pixel shader if appropriate
4406                         // do this before running depthmask code, to allow the pixelshader
4407                         // to clear pixelmask values for alpha testing
4408                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4409                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4410                         if (thread->depthmask)
4411                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4412                                         if (pixelmask[x])
4413                                                 depthpixel[x] = d;
4414                 }
4415                 else
4416                 {
4417                         // no depth testing means we're just dealing with color...
4418                         // if there is no color buffer, skip pixel shader
4419                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4420                         {
4421                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4422                                 span->pixelmask = pixelmask;
4423                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4424                         }
4425                 }
4426         }
4427         thread->numspans = 0;
4428 }
4429
// Declares the fields of the Draw command handled by DPSOFTRAST_Interpret_Draw below
// (which receives it as DPSOFTRAST_Command_Draw; DEFCOMMAND itself is defined elsewhere,
// and presumably 22 is the command's opcode - TODO confirm against the macro).
// As used by the interpreter: starty/endy are compared against the thread's y bands,
// refcount is decremented atomically by a thread that skips the command, and arrays
// holds numvertices*4 floats of screen coordinates followed by the vertex arrays.
4430 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4431
4432 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4433 {
4434 #ifdef SSE2_PRESENT
4435         int cullface = thread->cullface;
4436         int minx, maxx, miny, maxy;
4437         int miny1, maxy1, miny2, maxy2;
4438         __m128i fbmin, fbmax;
4439         __m128 viewportcenter, viewportscale;
4440         int firstvertex = command->firstvertex;
4441         int numvertices = command->numvertices;
4442         int numtriangles = command->numtriangles;
4443         const int *element3i = command->element3i;
4444         const unsigned short *element3s = command->element3s;
4445         int clipped = command->clipped;
4446         int i;
4447         int j;
4448         int k;
4449         int y;
4450         int e[3];
4451         __m128i screeny;
4452         int starty, endy, bandy;
4453         int numpoints;
4454         int clipcase;
4455         float clipdist[4];
4456         __m128 triangleedge1, triangleedge2, trianglenormal;
4457         __m128 clipfrac[3];
4458         __m128 screen[4];
4459         DPSOFTRAST_State_Triangle *triangle;
4460         DPSOFTRAST_Texture *texture;
4461         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4462         miny = thread->fb_scissor[1];
4463         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4464         miny1 = bound(miny, thread->miny1, maxy);
4465         maxy1 = bound(miny, thread->maxy1, maxy);
4466         miny2 = bound(miny, thread->miny2, maxy);
4467         maxy2 = bound(miny, thread->maxy2, maxy);
4468         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4469         {
4470                 if (!ATOMIC_DECREMENT(command->refcount))
4471                 {
4472                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4473                                 MM_FREE(command->arrays);
4474                 }
4475                 return;
4476         }
4477         minx = thread->fb_scissor[0];
4478         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4479         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4480         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4481         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4482         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4483         screen[3] = _mm_setzero_ps();
4484         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4485         for (i = 0;i < numtriangles;i++)
4486         {
4487                 const float *screencoord4f = command->arrays;
4488                 const float *arrays = screencoord4f + numvertices*4;
4489
4490                 // generate the 3 edges of this triangle
4491                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4492                 if (element3s)
4493                 {
4494                         e[0] = element3s[i*3+0] - firstvertex;
4495                         e[1] = element3s[i*3+1] - firstvertex;
4496                         e[2] = element3s[i*3+2] - firstvertex;
4497                 }
4498                 else if (element3i)
4499                 {
4500                         e[0] = element3i[i*3+0] - firstvertex;
4501                         e[1] = element3i[i*3+1] - firstvertex;
4502                         e[2] = element3i[i*3+2] - firstvertex;
4503                 }
4504                 else
4505                 {
4506                         e[0] = i*3+0;
4507                         e[1] = i*3+1;
4508                         e[2] = i*3+2;
4509                 }
4510
4511 #define SKIPBACKFACE \
4512                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4513                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4514                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4515                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4516                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4517                 switch(cullface) \
4518                 { \
4519                 case GL_BACK: \
4520                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4521                                 continue; \
4522                         break; \
4523                 case GL_FRONT: \
4524                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4525                                 continue; \
4526                         break; \
4527                 }
4528
4529 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4530                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4531                         { \
4532                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4533                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4534                         }
4535 #define CLIPPEDVERTEXCOPY(k,p1) \
4536                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4537
4538 #define GENATTRIBCOPY(attrib, p1) \
4539                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4540 #define GENATTRIBLERP(attrib, p1, p2) \
4541                 { \
4542                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4543                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4544                 }
4545 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4546                 switch(clipcase) \
4547                 { \
4548                 default: \
4549                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4550                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4551                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4552                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4553                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4554                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4555                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4556                 }
4557
4558                 if (! clipped)
4559                         goto notclipped;
4560
4561                 // calculate distance from nearplane
4562                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4563                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4564                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4565                 if (clipdist[0] >= 0.0f)
4566                 {
4567                         if (clipdist[1] >= 0.0f)
4568                         {
4569                                 if (clipdist[2] >= 0.0f)
4570                                 {
4571                                 notclipped:
4572                                         // triangle is entirely in front of nearplane
4573                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4574                                         SKIPBACKFACE;
4575                                         numpoints = 3;
4576                                         clipcase = 0;
4577                                 }
4578                                 else
4579                                 {
4580                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4581                                         SKIPBACKFACE;
4582                                         numpoints = 4;
4583                                         clipcase = 1;
4584                                 }
4585                         }
4586                         else
4587                         {
4588                                 if (clipdist[2] >= 0.0f)
4589                                 {
4590                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4591                                         SKIPBACKFACE;
4592                                         numpoints = 4;
4593                                         clipcase = 2;
4594                                 }
4595                                 else
4596                                 {
4597                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4598                                         SKIPBACKFACE;
4599                                         numpoints = 3;
4600                                         clipcase = 3;
4601                                 }
4602                         }
4603                 }
4604                 else if (clipdist[1] >= 0.0f)
4605                 {
4606                         if (clipdist[2] >= 0.0f)
4607                         {
4608                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4609                                 SKIPBACKFACE;
4610                                 numpoints = 4;
4611                                 clipcase = 4;
4612                         }
4613                         else
4614                         {
4615                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4616                                 SKIPBACKFACE;
4617                                 numpoints = 3;
4618                                 clipcase = 5;
4619                         }
4620                 }
4621                 else if (clipdist[2] >= 0.0f)
4622                 {
4623                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4624                         SKIPBACKFACE;
4625                         numpoints = 3;
4626                         clipcase = 6;
4627                 }
4628                 else continue; // triangle is entirely behind nearplane
4629
4630                 {
4631                         // calculate integer y coords for triangle points
4632                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4633                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4634                                         screenmin = _mm_min_epi16(screeni, screenir),
4635                                         screenmax = _mm_max_epi16(screeni, screenir);
4636                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4637                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4638                         screenmin = _mm_max_epi16(screenmin, fbmin);
4639                         screenmax = _mm_min_epi16(screenmax, fbmax);
4640                         // skip offscreen triangles
4641                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4642                                 continue;
4643                         starty = _mm_extract_epi16(screenmin, 1);
4644                         endy = _mm_extract_epi16(screenmax, 1)+1;
4645                         if (starty >= maxy1 && endy <= miny2)
4646                                 continue;
4647                         screeny = _mm_srai_epi32(screeni, 16);
4648                 }
4649
4650                 triangle = &thread->triangles[thread->numtriangles];
4651
4652                 // calculate attribute plans for triangle data...
4653                 // okay, this triangle is going to produce spans, we'd better project
4654                 // the interpolants now (this is what gives perspective texturing),
4655                 // this consists of simply multiplying all arrays by the W coord
4656                 // (which is basically 1/Z), which will be undone per-pixel
4657                 // (multiplying by Z again) to get the perspective-correct array
4658                 // values
4659                 {
4660                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4661                         __m128 mipedgescale, mipdensity;
4662                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4663                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4664                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4665                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4666                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4667                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4668                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4669                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4670                         attribedge1 = _mm_sub_ss(w0, w1);
4671                         attribedge2 = _mm_sub_ss(w2, w1);
4672                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4673                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4674                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4675                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4676                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4677                         _mm_store_ss(&triangle->w[0], attribxslope);
4678                         _mm_store_ss(&triangle->w[1], attribyslope);
4679                         _mm_store_ss(&triangle->w[2], attriborigin);
4680                         mipedgescale = _mm_setzero_ps();
4681                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4682                         {
4683                                 __m128 attrib0, attrib1, attrib2;
4684                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4685                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4686                                         break;
4687                                 arrays += numvertices*4;
4688                                 GENATTRIBS(attrib0, attrib1, attrib2);
4689                                 attriborigin = _mm_mul_ps(attrib1, w1);
4690                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4691                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4692                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4693                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4694                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4695                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4696                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4697                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
4698                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4699                                 {
4700                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4701                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4702                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4703                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4704                                 }
4705                         }
4706
4707                         memset(triangle->mip, 0, sizeof(triangle->mip));
4708                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4709                         {
4710                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4711                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4712                                         break;
4713                                 texture = thread->texbound[texunit];
4714                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4715                                 {
4716                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4717                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4718                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4719                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4720                                         // this will be multiplied in the texturing routine by the texture resolution
4721                                         y = _mm_cvtss_si32(mipdensity);
4722                                         if (y > 0)
4723                                         {
4724                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4725                                                 if (y > texture->mipmaps - 1)
4726                                                         y = texture->mipmaps - 1;
4727                                                 triangle->mip[texunit] = y;
4728                                         }
4729                                 }
4730                         }
4731                 }
4732         
4733                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4734                 for (; y < bandy;)
4735                 {
4736                         __m128 xcoords, xslope;
4737                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4738                         int yccmask = _mm_movemask_epi8(ycc);
4739                         int edge0p, edge0n, edge1p, edge1n;
4740                         int nexty;
4741                         if (numpoints == 4)
4742                         {
4743                                 switch(yccmask)
4744                                 {
4745                                 default:
4746                                 case 0xFFFF: /*0000*/ y = endy; continue;
4747                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4748                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4749                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4750                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4751                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4752                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4753                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4754                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4755                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4756                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4757                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4758                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4759                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4760                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4761                                 case 0x0000: /*1111*/ y++; continue;
4762                                 }
4763                         }
4764                         else
4765                         {
4766                                 switch(yccmask)
4767                                 {
4768                                 default:
4769                                 case 0xFFFF: /*000*/ y = endy; continue;
4770                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4771                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4772                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4773                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4774                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4775                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4776                                 case 0x0000: /*111*/ y++; continue;
4777                                 }
4778                         }
4779                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4780                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4781                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4782                         nexty = _mm_extract_epi16(ycc, 0);
4783                         if (nexty >= bandy) nexty = bandy-1;
4784                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4785                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4786                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4787                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4788                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4789                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4790                         {
4791                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4792                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4793                         }
4794                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4795                         {
4796                                 int startx, endx, offset;
4797                                 startx = _mm_cvtss_si32(xcoords);
4798                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4799                                 if (startx < minx) 
4800                                 {
4801                                         if (startx < 0) startx = 0;
4802                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4803                                 }
4804                                 if (endx > maxx) endx = maxx;
4805                                 if (startx >= endx) continue;
4806                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4807                                 {
4808                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4809                                         span->triangle = thread->numtriangles;
4810                                         span->x = offset;
4811                                         span->y = y;
4812                                         span->startx = max(minx - offset, 0);
4813                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4814                                         if (span->startx >= span->endx)
4815                                                 continue; 
4816                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4817                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4818                                 }
4819                         }
4820                 }
4821
4822                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4823                 {
4824                         DPSOFTRAST_Draw_ProcessSpans(thread);
4825                         thread->numtriangles = 0;
4826                 }
4827         }
4828
4829         if (!ATOMIC_DECREMENT(command->refcount))
4830         {
4831                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4832                         MM_FREE(command->arrays);
4833         }
4834
4835         if (thread->numspans > 0 || thread->numtriangles > 0)
4836         {
4837                 DPSOFTRAST_Draw_ProcessSpans(thread);
4838                 thread->numtriangles = 0;
4839         }
4840 #endif
4841 }
4842
4843 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4844 {
4845         int i;
4846         int j;
4847         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4848         int datasize = 2*numvertices*sizeof(float[4]);
4849         DPSOFTRAST_Command_Draw *command;
4850         unsigned char *data;
4851         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4852         {
4853                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4854                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4855                         break;
4856                 datasize += numvertices*sizeof(float[4]);
4857         }
4858         if (element3s)
4859                 datasize += numtriangles*sizeof(unsigned short[3]);
4860         else if (element3i)
4861                 datasize += numtriangles*sizeof(int[3]);
4862         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4863         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4864         {
4865                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4866                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4867         }
4868         else
4869         {
4870                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4871                 data = (unsigned char *)command + commandsize;
4872         }
4873         command->firstvertex = firstvertex;
4874         command->numvertices = numvertices;
4875         command->numtriangles = numtriangles;
4876         command->arrays = (float *)data;
4877         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4878         dpsoftrast.firstvertex = firstvertex;
4879         dpsoftrast.numvertices = numvertices;
4880         dpsoftrast.screencoord4f = (float *)data;
4881         data += numvertices*sizeof(float[4]);
4882         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4883         data += numvertices*sizeof(float[4]);
4884         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4885         {
4886                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4887                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4888                         break;
4889                 dpsoftrast.post_array4f[j] = (float *)data;
4890                 data += numvertices*sizeof(float[4]);
4891         }
4892         command->element3i = NULL;
4893         command->element3s = NULL;
4894         if (element3s)
4895         {
4896                 command->element3s = (unsigned short *)data;
4897                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4898         }
4899         else if (element3i)
4900         {
4901                 command->element3i = (int *)data;
4902                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4903         }
4904         return command;
4905 }
4906
4907 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4908 {
4909         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4910         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4911         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4912         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4913         if (command->starty >= command->endy)
4914         {
4915                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4916                         MM_FREE(command->arrays);
4917                 DPSOFTRAST_UndoCommand(command->commandsize);
4918                 return;
4919         }
4920         command->clipped = dpsoftrast.drawclipped;
4921         command->refcount = dpsoftrast.numthreads;
4922
4923         if (dpsoftrast.usethreads)
4924         {
4925                 int i;
4926                 DPSOFTRAST_Draw_SyncCommands();
4927                 for (i = 0; i < dpsoftrast.numthreads; i++)
4928                 {
4929                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4930                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4931                                 Thread_CondSignal(thread->drawcond);
4932                 }
4933         }
4934         else
4935         {
4936                 DPSOFTRAST_Draw_FlushThreads();
4937         }
4938 }
4939  
4940 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4941 {
4942         int commandoffset = thread->commandoffset;
4943         while (commandoffset != endoffset)
4944         {
4945                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4946                 switch (command->opcode)
4947                 {
4948 #define INTERPCOMMAND(name) \
4949                 case DPSOFTRAST_OPCODE_##name : \
4950                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4951                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4952                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4953                                 commandoffset = 0; \
4954                         break;
4955                 INTERPCOMMAND(Viewport)
4956                 INTERPCOMMAND(ClearColor)
4957                 INTERPCOMMAND(ClearDepth)
4958                 INTERPCOMMAND(ColorMask)
4959                 INTERPCOMMAND(DepthTest)
4960                 INTERPCOMMAND(ScissorTest)
4961                 INTERPCOMMAND(Scissor)
4962                 INTERPCOMMAND(BlendFunc)
4963                 INTERPCOMMAND(BlendSubtract)
4964                 INTERPCOMMAND(DepthMask)
4965                 INTERPCOMMAND(DepthFunc)
4966                 INTERPCOMMAND(DepthRange)
4967                 INTERPCOMMAND(PolygonOffset)
4968                 INTERPCOMMAND(CullFace)
4969                 INTERPCOMMAND(AlphaTest)
4970                 INTERPCOMMAND(AlphaFunc)
4971                 INTERPCOMMAND(SetTexture)
4972                 INTERPCOMMAND(SetShader)
4973                 INTERPCOMMAND(Uniform4f)
4974                 INTERPCOMMAND(UniformMatrix4f)
4975                 INTERPCOMMAND(Uniform1i)
4976
4977                 case DPSOFTRAST_OPCODE_Draw:
4978                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4979                         commandoffset += command->commandsize;
4980                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4981                                 commandoffset = 0;
4982                         thread->commandoffset = commandoffset;
4983                         break;
4984
4985                 case DPSOFTRAST_OPCODE_Reset:
4986                         commandoffset = 0;
4987                         break;
4988                 }
4989         }
4990         thread->commandoffset = commandoffset;
4991 }
4992
4993 static int DPSOFTRAST_Draw_Thread(void *data)
4994 {
4995         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4996         while(thread->index >= 0)
4997         {
4998                 if (thread->commandoffset != dpsoftrast.drawcommand)
4999                 {
5000                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5001                 }
5002                 else 
5003                 {
5004                         Thread_LockMutex(thread->drawmutex);
5005                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5006                         {
5007                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5008                                 thread->starving = true;
5009                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5010                                 thread->starving = false;
5011                         }
5012                         Thread_UnlockMutex(thread->drawmutex);
5013                 }
5014         }   
5015         return 0;
5016 }
5017
5018 static void DPSOFTRAST_Draw_FlushThreads(void)
5019 {
5020         DPSOFTRAST_State_Thread *thread;
5021         int i;
5022         DPSOFTRAST_Draw_SyncCommands();
5023         if (dpsoftrast.usethreads) 
5024         {
5025                 for (i = 0; i < dpsoftrast.numthreads; i++)
5026                 {
5027                         thread = &dpsoftrast.threads[i];
5028                         if (thread->commandoffset != dpsoftrast.drawcommand)
5029                         {
5030                                 Thread_LockMutex(thread->drawmutex);
5031                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5032                                         Thread_CondSignal(thread->drawcond);
5033                                 Thread_UnlockMutex(thread->drawmutex);
5034                         }
5035                 }
5036                 for (i = 0; i < dpsoftrast.numthreads; i++)
5037                 {
5038                         thread = &dpsoftrast.threads[i];
5039                         if (thread->commandoffset != dpsoftrast.drawcommand)
5040                         {
5041                                 Thread_LockMutex(thread->drawmutex);
5042                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5043                                 {
5044                                         thread->waiting = true;
5045                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5046                                         thread->waiting = false;
5047                                 }
5048                                 Thread_UnlockMutex(thread->drawmutex);
5049                         }
5050                 }
5051         }
5052         else
5053         {
5054                 for (i = 0; i < dpsoftrast.numthreads; i++)
5055                 {
5056                         thread = &dpsoftrast.threads[i];
5057                         if (thread->commandoffset != dpsoftrast.drawcommand)
5058                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5059                 }
5060         }
5061         dpsoftrast.commandpool.usedcommands = 0;
5062 }
5063
// Public flush entry point; forwards to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5068
// Public finish entry point; currently identical to DPSOFTRAST_Flush().
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Flush();
}
5073
5074 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5075 {
5076         int i;
5077         union
5078         {
5079                 int i;
5080                 unsigned char b[4];
5081         }
5082         u;
5083         u.i = 1;
5084         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5085         dpsoftrast.bigendian = u.b[3];
5086         dpsoftrast.fb_width = width;
5087         dpsoftrast.fb_height = height;
5088         dpsoftrast.fb_depthpixels = depthpixels;
5089         dpsoftrast.fb_colorpixels[0] = colorpixels;
5090         dpsoftrast.fb_colorpixels[1] = NULL;
5091         dpsoftrast.fb_colorpixels[1] = NULL;
5092         dpsoftrast.fb_colorpixels[1] = NULL;
5093         dpsoftrast.viewport[0] = 0;
5094         dpsoftrast.viewport[1] = 0;
5095         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5096         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5097         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5098         dpsoftrast.texture_firstfree = 1;
5099         dpsoftrast.texture_end = 1;
5100         dpsoftrast.texture_max = 0;
5101         dpsoftrast.color[0] = 1;
5102         dpsoftrast.color[1] = 1;
5103         dpsoftrast.color[2] = 1;
5104         dpsoftrast.color[3] = 1;
5105         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5106         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5107         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5108         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5109         for (i = 0; i < dpsoftrast.numthreads; i++)
5110         {
5111                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5112                 thread->index = i;
5113                 thread->cullface = GL_BACK;
5114                 thread->colormask[1] = 1;
5115                 thread->colormask[2] = 1;
5116                 thread->colormask[3] = 1;
5117                 thread->blendfunc[0] = GL_ONE;
5118                 thread->blendfunc[1] = GL_ZERO;
5119                 thread->depthmask = true;
5120                 thread->depthtest = true;
5121                 thread->depthfunc = GL_LEQUAL;
5122                 thread->scissortest = false;
5123                 thread->alphatest = false;
5124                 thread->alphafunc = GL_GREATER;
5125                 thread->alphavalue = 0.5f;
5126                 thread->viewport[0] = 0;
5127                 thread->viewport[1] = 0;
5128                 thread->viewport[2] = dpsoftrast.fb_width;
5129                 thread->viewport[3] = dpsoftrast.fb_height;
5130                 thread->scissor[0] = 0;
5131                 thread->scissor[1] = 0;
5132                 thread->scissor[2] = dpsoftrast.fb_width;
5133                 thread->scissor[3] = dpsoftrast.fb_height;
5134                 thread->depthrange[0] = 0;
5135                 thread->depthrange[1] = 1;
5136                 thread->polygonoffset[0] = 0;
5137                 thread->polygonoffset[1] = 0;
5138         
5139                 if (dpsoftrast.interlace)
5140                 {
5141                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5142                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5143                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5144                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5145                 }
5146                 else
5147                 {
5148                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5149                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5150                 }
5151
5152                 thread->numspans = 0;
5153                 thread->numtriangles = 0;
5154                 thread->commandoffset = 0;
5155                 thread->waiting = false;
5156                 thread->starving = false;
5157            
5158                 thread->validate = -1;
5159                 DPSOFTRAST_Validate(thread, -1);
5160  
5161                 if (dpsoftrast.usethreads)
5162                 {
5163                         thread->waitcond = Thread_CreateCond();
5164                         thread->drawcond = Thread_CreateCond();
5165                         thread->drawmutex = Thread_CreateMutex();
5166                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5167                 }
5168         }
5169         return 0;
5170 }
5171
5172 void DPSOFTRAST_Shutdown(void)
5173 {
5174         int i;
5175         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5176         {
5177                 DPSOFTRAST_State_Thread *thread;
5178                 for (i = 0; i < dpsoftrast.numthreads; i++)
5179                 {
5180                         thread = &dpsoftrast.threads[i];
5181                         Thread_LockMutex(thread->drawmutex);
5182                         thread->index = -1;
5183                         Thread_CondSignal(thread->drawcond);
5184                         Thread_UnlockMutex(thread->drawmutex);
5185                         Thread_WaitThread(thread->thread, 0);
5186                         Thread_DestroyCond(thread->waitcond);
5187                         Thread_DestroyCond(thread->drawcond);
5188                         Thread_DestroyMutex(thread->drawmutex);
5189                 }
5190         }
5191         for (i = 0;i < dpsoftrast.texture_end;i++)
5192                 if (dpsoftrast.texture[i].bytes)
5193                         MM_FREE(dpsoftrast.texture[i].bytes);
5194         if (dpsoftrast.texture)
5195                 free(dpsoftrast.texture);
5196         if (dpsoftrast.threads)
5197                 MM_FREE(dpsoftrast.threads);
5198         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5199 }
5200