]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fb validation fix
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// In C builds, make "bool" an alias of qboolean (declared outside this file)
// so the code below can use bool in both C and C++ translation units.
#ifndef __cplusplus
typedef qboolean bool;
#endif

// Alignment values in bytes: ALIGN_SIZE is reused below as COMMAND_SIZE,
// ATOMIC_SIZE is the alignment requested from _mm_malloc.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32
15
// Platform specific alignment and atomic helpers, only selected when
// SSE2_PRESENT is defined.  Each branch provides the same macro set:
//   ALIGN(var)                 - declare var with 16 byte alignment
//   ATOMIC(var)                - declare var with 32 byte alignment
//   MEMORY_BARRIER             - _mm_sfence()
//   ATOMIC_COUNTER             - integer type usable with the macros below
//   ATOMIC_INCREMENT(counter)  - atomic increment
//   ATOMIC_DECREMENT(counter)  - atomic decrement
//   ATOMIC_ADD(counter, val)   - atomic add, result discarded
// Anything left undefined here gets a plain non-atomic fallback further down.
#ifdef SSE2_PRESENT
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif
46
// Plain, non-atomic fallbacks for every helper macro that the platform
// section above did not provide.

// no alignment request: the declaration is passed through unchanged
#if !defined(ALIGN)
#define ALIGN(var) var
#endif

#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif

// no barrier available: expands to a no-op expression
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif

// ordinary int counter with ordinary (non-atomic) arithmetic
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif

#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif

#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif

#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
68
// Allocation helpers.  With SSE2 the buffers come from _mm_malloc aligned to
// ATOMIC_SIZE and must be released with _mm_free (MM_FREE); without SSE2 the
// helpers map directly onto malloc/calloc/free.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned counterpart of calloc: returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // refuse requests whose byte count would wrap around, like calloc does
        if (nmemb != 0 && size > (size_t)-1 / nmemb)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// Indices of the per-vertex attribute arrays (first index of the attribs
// and post_array4f arrays in this file).  DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays, not an array itself.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION  = 0,
        DPSOFTRAST_ARRAY_COLOR     = 1,
        // texture coordinate sets 0..7
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
103
// One slot of the dpsoftrast.texture array.  A slot whose bytes pointer is
// NULL is treated as unused by DPSOFTRAST_Texture_New and
// DPSOFTRAST_Texture_GetByIndex.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags value passed to DPSOFTRAST_Texture_New
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of mip levels described in mipmap[]
        int size; // total byte size of all mip levels
        ATOMIC_COUNTER binds; // while nonzero, texture functions call DPSOFTRAST_Flush before touching the pixels
        unsigned char *bytes; // storage for all mip levels, 4 bytes per texel
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
}
DPSOFTRAST_Texture;
119
// Commands use the same alignment as ALIGN().
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Generic command header; every struct generated by DEFCOMMAND starts with
// these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
        unsigned short commandsize;
}
DPSOFTRAST_Command);
129
// Opcode 0 has no command struct of its own.
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and an aligned struct type
// DPSOFTRAST_Command_<name> holding the common header (opcode, commandsize)
// followed by "fields".  The expansion already ends in ';', so uses of the
// macro need no trailing semicolon.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
140
// Byte size of the command pool buffer (2 MiB).
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest single command in bytes - confirm against the users of this constant
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Storage for queued commands.
// NOTE(review): freecommand/usedcommands look like a write position and a
// byte count inside commands[] - their users are outside this part of the
// file, confirm there.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
151
// Per-triangle interpolation data read by the DPSOFTRAST_CALCATTRIB macros.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // attribs[array][0] is the change per framebuffer x step, [1] the
        // change per y step and [2] the base value; each holds 4 components
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
159
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar) evaluate
// one attribute array of a triangle at the start of a span:
//   slope = attribs[arrayindex][0]                     (change per x step)
//   data  = attribs[arrayindex][2] + span->x * slope
//                                  + span->y * attribs[arrayindex][1]
// triangle and span are pointers; in the SSE form data and slope are __m128
// lvalues, in the 4F form they are arrays (or pointers to) 4 floats.
// Both expand to a brace block and evaluate their arguments several times,
// so arguments must be free of side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// every use of span, data and slope is parenthesized so that expressions
// such as &localspan or buffer+4 can be passed as arguments
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels belonging to a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
189
// capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// Bits of DPSOFTRAST_State_Thread::validate.  A set bit means the matching
// derived values are out of date; DPSOFTRAST_Validate clears the bit and
// calls DPSOFTRAST_RecalcFB / RecalcDepthFunc / RecalcBlendFunc.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three bits together
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// Blend modes stored in fb_blendmode.  DPSOFTRAST_RecalcBlendFunc picks one
// from the blendfunc factor pair and the blendsubtract flag, falling back
// to OPAQUE for pairs it does not know.  TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
        DPSOFTRAST_BLENDMODE_ALPHA       = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
        DPSOFTRAST_BLENDMODE_ADD         = 3,
        DPSOFTRAST_BLENDMODE_INVMOD      = 4,
        DPSOFTRAST_BLENDMODE_MUL         = 5,
        DPSOFTRAST_BLENDMODE_MUL2        = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_INVADD      = 9,
        DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
213
// Per-thread rasterizer state: a copy of the render state, the values
// derived from it (refreshed by DPSOFTRAST_Validate), the rows of the
// framebuffer assigned to the thread, and its span/triangle buffers.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread among dpsoftrast.numthreads, used to compute the bands below
        
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4]; // x, y, width, height
        int scissor[4]; // x, y, width, height
        float depthrange[2];
        float polygonoffset[2];

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture, rebased by DPSOFTRAST_Texture_Grow
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height clamped to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode; // a DPSOFTRAST_BLENDMODE value

        // band boundaries
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
279
// Global rasterizer state: framebuffer description, vertex array pointers,
// the texture table, thread bookkeeping and the command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into texture[], rebased by DPSOFTRAST_Texture_Grow

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        int texture_max; // number of slots allocated in texture[]
        int texture_end; // one past the highest slot in use
        int texture_firstfree; // where DPSOFTRAST_Texture_New starts looking for an unused slot
        DPSOFTRAST_Texture *texture; // texture table, indexed by texture number

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // nonzero gives each thread two bands of rows instead of one
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer instance used by all functions in this file
DPSOFTRAST_State dpsoftrast;
339
// DPSOFTRAST_DEPTHSCALE is 2^30; DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) gives
// DEPTHSCALE * (1 - d), so d = 1 yields 0 and d = 0 yields 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float components (scaled by 255 and rounded) into one int with
// blue in bits 0-7, green in 8-15, red in 16-23 and alpha in 24-31.
// Arguments are parenthesized so expressions such as x+y can be passed, and
// the shifts are done on unsigned values because shifting an alpha of 255
// into bit 31 of a signed int is undefined behavior; the result is converted
// back to int so the type of the expression is unchanged.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
345
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
347 {
348         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350         fb_viewportcenter[3] = 0.5f;
351         fb_viewportcenter[0] = 0.0f;
352         fb_viewportscale[1] = 0.5f * viewport[2];
353         fb_viewportscale[2] = -0.5f * viewport[3];
354         fb_viewportscale[3] = 0.5f;
355         fb_viewportscale[0] = 1.0f;
356 }
357
358 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
359 {
360         if (dpsoftrast.interlace)
361         {
362                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
363                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
364                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
365                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
366         }
367         else
368         {
369                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
370                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
371         }
372 }
373
374 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
375 {
376         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
377         // and viewport projection values
378         int x1, x2;
379         int y1, y2;
380         x1 = thread->scissor[0];
381         x2 = thread->scissor[0] + thread->scissor[2];
382         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
383         y2 = dpsoftrast.fb_height - thread->scissor[1];
384         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
385         if (x1 < 0) x1 = 0;
386         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
387         if (y1 < 0) y1 = 0;
388         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
389         thread->fb_scissor[0] = x1;
390         thread->fb_scissor[1] = y1;
391         thread->fb_scissor[2] = x2 - x1;
392         thread->fb_scissor[3] = y2 - y1;
393
394         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
395         DPSOFTRAST_RecalcThread(thread);
396 }
397
398 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
399 {
400         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
401 }
402
403 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
404 {
405         if (thread->blendsubtract)
406         {
407                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
408                 {
409                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
410                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
411                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
412                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
413                 }
414         }
415         else
416         {       
417                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
418                 {
419                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
420                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
421                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
422                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
423                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
424                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
425                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
426                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
427                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
428                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
429                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
430                 }
431         }
432 }
433
// Cheap inline pre-check: only calls DPSOFTRAST_Validate when at least one
// of the requested flags f is still pending in thread->validate.  Always
// evaluates to 0.  The thread argument is parenthesized so that pointer
// expressions can be passed; it is evaluated more than once, so it must not
// have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
435
436 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
437 {
438         mask &= thread->validate;
439         if (!mask)
440                 return;
441         if (mask & DPSOFTRAST_VALIDATE_FB)
442         {
443                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
444                 DPSOFTRAST_RecalcFB(thread);
445         }
446         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
447         {
448                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
449                 DPSOFTRAST_RecalcDepthFunc(thread);
450         }
451         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
452         {
453                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
454                 DPSOFTRAST_RecalcBlendFunc(thread);
455         }
456 }
457
458 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
459 {
460         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
461                 return &dpsoftrast.texture[index];
462         return NULL;
463 }
464
465 static void DPSOFTRAST_Texture_Grow(void)
466 {
467         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
468         DPSOFTRAST_State_Thread *thread;
469         int i;
470         int j;
471         DPSOFTRAST_Flush();
472         // expand texture array as needed
473         if (dpsoftrast.texture_max < 1024)
474                 dpsoftrast.texture_max = 1024;
475         else
476                 dpsoftrast.texture_max *= 2;
477         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
478         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
479                 if (dpsoftrast.texbound[i])
480                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
481         for (j = 0; j < dpsoftrast.numthreads; j++)
482         {
483                 thread = &dpsoftrast.threads[j];
484                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
485                         if (thread->texbound[i])
486                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
487         }
488 }
489
490 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
491 {
492         int w;
493         int h;
494         int d;
495         int size;
496         int s;
497         int texnum;
498         int mipmaps;
499         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
500         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
501         DPSOFTRAST_Texture *texture;
502         if (width*height*depth < 1)
503         {
504                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
505                 return 0;
506         }
507         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
510                 return 0;
511         }
512         switch(texformat)
513         {
514         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
515         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
516         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
517                 break;
518         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
519                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
520                 {
521                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
522                         return 0;
523                 }
524                 if (depth != 1)
525                 {
526                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
527                         return 0;
528                 }
529                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
530                 {
531                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
532                         return 0;
533                 }
534                 break;
535         }
536         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
537         {
538                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
539                 return 0;
540         }
541         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
542         {
543                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
544                 return 0;
545         }
546         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547         {
548                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
549                 return 0;
550         }
551         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552         {
553                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
554                 return 0;
555         }
556         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
557         {
558                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
559                 return 0;
560         }
561         // find first empty slot in texture array
562         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
563                 if (!dpsoftrast.texture[texnum].bytes)
564                         break;
565         dpsoftrast.texture_firstfree = texnum + 1;
566         if (dpsoftrast.texture_max <= texnum)
567                 DPSOFTRAST_Texture_Grow();
568         if (dpsoftrast.texture_end <= texnum)
569                 dpsoftrast.texture_end = texnum + 1;
570         texture = &dpsoftrast.texture[texnum];
571         memset(texture, 0, sizeof(*texture));
572         texture->flags = flags;
573         texture->width = width;
574         texture->height = height;
575         texture->depth = depth;
576         texture->sides = sides;
577         texture->binds = 0;
578         w = width;
579         h = height;
580         d = depth;
581         size = 0;
582         mipmaps = 0;
583         w = width;
584         h = height;
585         d = depth;
586         for (;;)
587         {
588                 s = w * h * d * sides * 4;
589                 texture->mipmap[mipmaps][0] = size;
590                 texture->mipmap[mipmaps][1] = s;
591                 texture->mipmap[mipmaps][2] = w;
592                 texture->mipmap[mipmaps][3] = h;
593                 texture->mipmap[mipmaps][4] = d;
594                 size += s;
595                 mipmaps++;
596                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
597                         break;
598                 if (w > 1) w >>= 1;
599                 if (h > 1) h >>= 1;
600                 if (d > 1) d >>= 1;
601         }
602         texture->mipmaps = mipmaps;
603         texture->size = size;
604
605         // allocate the pixels now
606         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
607
608         return texnum;
609 }
610 void DPSOFTRAST_Texture_Free(int index)
611 {
612         DPSOFTRAST_Texture *texture;
613         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
614         if (texture->binds)
615                 DPSOFTRAST_Flush();
616         if (texture->bytes)
617                 MM_FREE(texture->bytes);
618         texture->bytes = NULL;
619         memset(texture, 0, sizeof(*texture));
620         // adjust the free range and used range
621         if (dpsoftrast.texture_firstfree > index)
622                 dpsoftrast.texture_firstfree = index;
623         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
624                 dpsoftrast.texture_end--;
625 }
626 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
627 {
628         int i, x, y, z, w, layer0, layer1, row0, row1;
629         unsigned char *o, *i0, *i1, *i2, *i3;
630         DPSOFTRAST_Texture *texture;
631         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
632         if (texture->mipmaps <= 1)
633                 return;
634         for (i = 1;i < texture->mipmaps;i++)
635         {
636                 for (z = 0;z < texture->mipmap[i][4];z++)
637                 {
638                         layer0 = z*2;
639                         layer1 = z*2+1;
640                         if (layer1 >= texture->mipmap[i-1][4])
641                                 layer1 = texture->mipmap[i-1][4]-1;
642                         for (y = 0;y < texture->mipmap[i][3];y++)
643                         {
644                                 row0 = y*2;
645                                 row1 = y*2+1;
646                                 if (row1 >= texture->mipmap[i-1][3])
647                                         row1 = texture->mipmap[i-1][3]-1;
648                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
649                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
650                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
651                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
652                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
653                                 w = texture->mipmap[i][2];
654                                 if (layer1 > layer0)
655                                 {
656                                         if (texture->mipmap[i-1][2] > 1)
657                                         {
658                                                 // average 3D texture
659                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
660                                                 {
661                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
662                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
663                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
664                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
665                                                 }
666                                         }
667                                         else
668                                         {
669                                                 // average 3D mipmap with parent width == 1
670                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
671                                                 {
672                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
673                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
674                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
675                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
676                                                 }
677                                         }
678                                 }
679                                 else
680                                 {
681                                         if (texture->mipmap[i-1][2] > 1)
682                                         {
683                                                 // average 2D texture (common case)
684                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
685                                                 {
686                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
687                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
688                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
689                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
690                                                 }
691                                         }
692                                         else
693                                         {
694                                                 // 2D texture with parent width == 1
695                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
696                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
697                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
698                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
699                                         }
700                                 }
701                         }
702                 }
703         }
704 }
705 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
706 {
707         DPSOFTRAST_Texture *texture;
708         unsigned char *dst;
709         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
710         if (texture->binds)
711                 DPSOFTRAST_Flush();
712         if (pixels)
713         {
714                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
715                 while (blockheight > 0)
716                 {
717                         memcpy(dst, pixels, blockwidth * 4);
718                         pixels += blockwidth * 4;
719                         dst += texture->mipmap[0][2] * 4;
720                         blockheight--;
721                 }
722         }
723         DPSOFTRAST_Texture_CalculateMipmaps(index);
724 }
725 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
726 {
727         DPSOFTRAST_Texture *texture;
728         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
729         if (texture->binds)
730                 DPSOFTRAST_Flush();
731         if (pixels)
732                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
733         DPSOFTRAST_Texture_CalculateMipmaps(index);
734 }
735 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
736 {
737         DPSOFTRAST_Texture *texture;
738         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
739         return texture->mipmap[mip][2];
740 }
741 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
742 {
743         DPSOFTRAST_Texture *texture;
744         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
745         return texture->mipmap[mip][3];
746 }
747 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
751         return texture->mipmap[mip][4];
752 }
753 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
754 {
755         DPSOFTRAST_Texture *texture;
756         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
757         if (texture->binds)
758                 DPSOFTRAST_Flush();
759         return texture->bytes + texture->mipmap[mip][0];
760 }
761 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
762 {
763         DPSOFTRAST_Texture *texture;
764         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
765         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
766         {
767                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
768                 return;
769         }
770         if (texture->binds)
771                 DPSOFTRAST_Flush();
772         texture->filter = filter;
773 }
774
775 static void DPSOFTRAST_Draw_FlushThreads(void);
776
777 static void DPSOFTRAST_Draw_SyncCommands(void)
778 {
779         if(dpsoftrast.usethreads) MEMORY_BARRIER;
780         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
781 }
782
783 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
784 {
785         DPSOFTRAST_State_Thread *thread;
786         int i;
787         int freecommand = dpsoftrast.commandpool.freecommand;
788         int usedcommands = dpsoftrast.commandpool.usedcommands;
789         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
790                 return;
791         DPSOFTRAST_Draw_SyncCommands();
792         for(;;)
793         {
794                 int waitindex = -1;
795                 int commandoffset;
796                 usedcommands = 0;
797                 for (i = 0; i < dpsoftrast.numthreads; i++)
798                 {
799                         thread = &dpsoftrast.threads[i]; 
800                         commandoffset = freecommand - thread->commandoffset;
801                         if (commandoffset < 0)
802                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
803                         if (commandoffset > usedcommands)
804                         {
805                                 waitindex = i;
806                                 usedcommands = commandoffset;
807                         }
808                 }
809                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
810                         break;
811                 thread = &dpsoftrast.threads[waitindex];
812                 Thread_LockMutex(thread->drawmutex);
813                 if (thread->commandoffset != dpsoftrast.drawcommand)
814                 {
815                         thread->waiting = true;
816                         if (thread->starving) Thread_CondSignal(thread->drawcond);
817                         Thread_CondWait(thread->waitcond, thread->drawmutex);
818                         thread->waiting = false;
819                 }
820                 Thread_UnlockMutex(thread->drawmutex);
821         }
822         dpsoftrast.commandpool.usedcommands = usedcommands;
823 }
824
// Rounds size up to the next multiple of COMMAND_SIZE.
// The masking only works when COMMAND_SIZE is a power of two; size is evaluated more than once.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) - (((size) + (COMMAND_SIZE-1)) & (COMMAND_SIZE-1)))
// Allocates a DPSOFTRAST_Command_<name> of aligned size from the command pool,
// tagged with DPSOFTRAST_OPCODE_<name>, and yields a typed pointer to it.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_##name, DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
829
830 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
831 {
832         DPSOFTRAST_Command *command;
833         int freecommand = dpsoftrast.commandpool.freecommand;
834         int usedcommands = dpsoftrast.commandpool.usedcommands;
835         int extra = sizeof(DPSOFTRAST_Command);
836         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
837                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
838         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
839         {
840                 if (dpsoftrast.usethreads)
841                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
842                 else
843                         DPSOFTRAST_Draw_FlushThreads();
844                 freecommand = dpsoftrast.commandpool.freecommand;
845                 usedcommands = dpsoftrast.commandpool.usedcommands;
846         }
847         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
848         {
849                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
850                 command->opcode = DPSOFTRAST_OPCODE_Reset;
851                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
852                 freecommand = 0;
853         }
854         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855         command->opcode = opcode;
856         command->commandsize = size;
857         freecommand += size;
858         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
859                 freecommand = 0;
860         dpsoftrast.commandpool.freecommand = freecommand;
861         dpsoftrast.commandpool.usedcommands = usedcommands + size;
862         return command;
863 }
864
865 static void DPSOFTRAST_UndoCommand(int size)
866 {
867         int freecommand = dpsoftrast.commandpool.freecommand;
868         int usedcommands = dpsoftrast.commandpool.usedcommands;
869         freecommand -= size;
870         if (freecommand < 0)
871                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
872         usedcommands -= size;
873         dpsoftrast.commandpool.freecommand = freecommand;
874         dpsoftrast.commandpool.usedcommands = usedcommands;
875 }
876                 
877 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
878 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
879 {
880         thread->viewport[0] = command->x;
881         thread->viewport[1] = command->y;
882         thread->viewport[2] = command->width;
883         thread->viewport[3] = command->height;
884         thread->validate |= DPSOFTRAST_VALIDATE_FB;
885 }
886 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
887 {
888         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
889         command->x = x;
890         command->y = y;
891         command->width = width;
892         command->height = height;
893
894         dpsoftrast.viewport[0] = x;
895         dpsoftrast.viewport[1] = y;
896         dpsoftrast.viewport[2] = width;
897         dpsoftrast.viewport[3] = height;
898         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
899 }
900
901 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
902 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
903 {
904         int i, x1, y1, x2, y2, w, h, x, y;
905         int miny1, maxy1, miny2, maxy2;
906         int bandy;
907         unsigned int *p;
908         unsigned int c;
909         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
910         miny1 = thread->miny1;
911         maxy1 = thread->maxy1;
912         miny2 = thread->miny2;
913         maxy2 = thread->maxy2;
914         x1 = thread->fb_scissor[0];
915         y1 = thread->fb_scissor[1];
916         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
917         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
918         if (y1 < miny1) y1 = miny1;
919         if (y2 > maxy2) y2 = maxy2;
920         w = x2 - x1;
921         h = y2 - y1;
922         if (w < 1 || h < 1)
923                 return;
924         // FIXME: honor fb_colormask?
925         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
926         for (i = 0;i < 4;i++)
927         {
928                 if (!dpsoftrast.fb_colorpixels[i])
929                         continue;
930                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
931                 for (;y < bandy;y++)
932                 {
933                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
934                         for (x = x1;x < x2;x++)
935                                 p[x] = c;
936                 }
937         }
938 }
939 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
940 {
941         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
942         command->r = r;
943         command->g = g;
944         command->b = b;
945         command->a = a;
946 }
947
948 DEFCOMMAND(3, ClearDepth, float depth;)
949 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
950 {
951         int x1, y1, x2, y2, w, h, x, y;
952         int miny1, maxy1, miny2, maxy2;
953         int bandy;
954         unsigned int *p;
955         unsigned int c;
956         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
957         miny1 = thread->miny1;
958         maxy1 = thread->maxy1;
959         miny2 = thread->miny2;
960         maxy2 = thread->maxy2;
961         x1 = thread->fb_scissor[0];
962         y1 = thread->fb_scissor[1];
963         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
964         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
965         if (y1 < miny1) y1 = miny1;
966         if (y2 > maxy2) y2 = maxy2;
967         w = x2 - x1;
968         h = y2 - y1;
969         if (w < 1 || h < 1)
970                 return;
971         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
972         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
973         for (;y < bandy;y++)
974         {
975                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
976                 for (x = x1;x < x2;x++)
977                         p[x] = c;
978         }
979 }
980 void DPSOFTRAST_ClearDepth(float d)
981 {
982         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
983         command->depth = d;
984 }
985
986 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
987 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
988 {
989         thread->colormask[0] = command->r != 0;
990         thread->colormask[1] = command->g != 0;
991         thread->colormask[2] = command->b != 0;
992         thread->colormask[3] = command->a != 0;
993         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
994 }
995 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
996 {
997         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
998         command->r = r;
999         command->g = g;
1000         command->b = b;
1001         command->a = a;
1002 }
1003
1004 DEFCOMMAND(5, DepthTest, int enable;)
1005 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1006 {
1007         thread->depthtest = command->enable;
1008         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1009 }
1010 void DPSOFTRAST_DepthTest(int enable)
1011 {
1012         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(6, ScissorTest, int enable;)
1017 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1018 {
1019         thread->scissortest = command->enable;
1020         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1021 }
1022 void DPSOFTRAST_ScissorTest(int enable)
1023 {
1024         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1025         command->enable = enable;
1026 }
1027
1028 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1029 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1030 {
1031         thread->scissor[0] = command->x;
1032         thread->scissor[1] = command->y;
1033         thread->scissor[2] = command->width;
1034         thread->scissor[3] = command->height;
1035         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1036 }
1037 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1038 {
1039         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1040         command->x = x;
1041         command->y = y;
1042         command->width = width;
1043         command->height = height;
1044 }
1045
1046 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1047 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1048 {
1049         thread->blendfunc[0] = command->sfactor;
1050         thread->blendfunc[1] = command->dfactor;
1051         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1052 }
1053 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1054 {
1055         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1056         command->sfactor = sfactor;
1057         command->dfactor = dfactor;
1058 }
1059
1060 DEFCOMMAND(9, BlendSubtract, int enable;)
1061 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1062 {
1063         thread->blendsubtract = command->enable;
1064         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1065 }
1066 void DPSOFTRAST_BlendSubtract(int enable)
1067 {
1068         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1069         command->enable = enable;
1070 }
1071
1072 DEFCOMMAND(10, DepthMask, int enable;)
1073 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1074 {
1075         thread->depthmask = command->enable;
1076 }
1077 void DPSOFTRAST_DepthMask(int enable)
1078 {
1079         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1080         command->enable = enable;
1081 }
1082
1083 DEFCOMMAND(11, DepthFunc, int func;)
1084 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1085 {
1086         thread->depthfunc = command->func;
1087 }
1088 void DPSOFTRAST_DepthFunc(int func)
1089 {
1090         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1091         command->func = func;
1092 }
1093
1094 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1095 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1096 {
1097         thread->depthrange[0] = command->nearval;
1098         thread->depthrange[1] = command->farval;
1099 }
1100 void DPSOFTRAST_DepthRange(float nearval, float farval)
1101 {
1102         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1103         command->nearval = nearval;
1104         command->farval = farval;
1105 }
1106
1107 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1108 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1109 {
1110         thread->polygonoffset[0] = command->alongnormal;
1111         thread->polygonoffset[1] = command->intoview;
1112 }
1113 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1114 {
1115         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1116         command->alongnormal = alongnormal;
1117         command->intoview = intoview;
1118 }
1119
1120 DEFCOMMAND(14, CullFace, int mode;)
1121 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1122 {
1123         thread->cullface = command->mode;
1124 }
1125 void DPSOFTRAST_CullFace(int mode)
1126 {
1127         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1128         command->mode = mode;
1129 }
1130
1131 DEFCOMMAND(15, AlphaTest, int enable;)
1132 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1133 {
1134         thread->alphatest = command->enable;
1135 }
1136 void DPSOFTRAST_AlphaTest(int enable)
1137 {
1138         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1139         command->enable = enable;
1140 }
1141
1142 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1143 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1144 {
1145         thread->alphafunc = command->func;
1146         thread->alphavalue = command->ref;
1147 }
1148 void DPSOFTRAST_AlphaFunc(int func, float ref)
1149 {
1150         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1151         command->func = func;
1152         command->ref = ref;
1153 }
1154
1155 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1156 {
1157         dpsoftrast.color[0] = r;
1158         dpsoftrast.color[1] = g;
1159         dpsoftrast.color[2] = b;
1160         dpsoftrast.color[3] = a;
1161 }
1162
1163 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1164 {
1165         int outstride = blockwidth * 4;
1166         int instride = dpsoftrast.fb_width * 4;
1167         int bx1 = blockx;
1168         int by1 = blocky;
1169         int bx2 = blockx + blockwidth;
1170         int by2 = blocky + blockheight;
1171         int bw;
1172         int x;
1173         int y;
1174         unsigned char *inpixels;
1175         unsigned char *b;
1176         unsigned char *o;
1177         DPSOFTRAST_Flush();
1178         if (bx1 < 0) bx1 = 0;
1179         if (by1 < 0) by1 = 0;
1180         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1181         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1182         bw = bx2 - bx1;
1183         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1184         if (dpsoftrast.bigendian)
1185         {
1186                 for (y = by1;y < by2;y++)
1187                 {
1188                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1190                         for (x = bx1;x < bx2;x++)
1191                         {
1192                                 o[0] = b[3];
1193                                 o[1] = b[2];
1194                                 o[2] = b[1];
1195                                 o[3] = b[0];
1196                                 o += 4;
1197                                 b += 4;
1198                         }
1199                 }
1200         }
1201         else
1202         {
1203                 for (y = by1;y < by2;y++)
1204                 {
1205                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1206                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1207                         memcpy(o, b, bw*4);
1208                 }
1209         }
1210
1211 }
1212 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1213 {
1214         int tx1 = tx;
1215         int ty1 = ty;
1216         int tx2 = tx + width;
1217         int ty2 = ty + height;
1218         int sx1 = sx;
1219         int sy1 = sy;
1220         int sx2 = sx + width;
1221         int sy2 = sy + height;
1222         int swidth;
1223         int sheight;
1224         int twidth;
1225         int theight;
1226         int sw;
1227         int sh;
1228         int tw;
1229         int th;
1230         int y;
1231         unsigned int *spixels;
1232         unsigned int *tpixels;
1233         DPSOFTRAST_Texture *texture;
1234         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1235         if (mip < 0 || mip >= texture->mipmaps) return;
1236         DPSOFTRAST_Flush();
1237         spixels = dpsoftrast.fb_colorpixels[0];
1238         swidth = dpsoftrast.fb_width;
1239         sheight = dpsoftrast.fb_height;
1240         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1241         twidth = texture->mipmap[mip][2];
1242         theight = texture->mipmap[mip][3];
1243         if (tx1 < 0) tx1 = 0;
1244         if (ty1 < 0) ty1 = 0;
1245         if (tx2 > twidth) tx2 = twidth;
1246         if (ty2 > theight) ty2 = theight;
1247         if (sx1 < 0) sx1 = 0;
1248         if (sy1 < 0) sy1 = 0;
1249         if (sx2 > swidth) sx2 = swidth;
1250         if (sy2 > sheight) sy2 = sheight;
1251         tw = tx2 - tx1;
1252         th = ty2 - ty1;
1253         sw = sx2 - sx1;
1254         sh = sy2 - sy1;
1255         if (tw > sw) tw = sw;
1256         if (th > sh) th = sh;
1257         if (tw < 1 || th < 1)
1258                 return;
1259         sy1 = sheight - 1 - sy1;
1260         for (y = 0;y < th;y++)
1261                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1262         if (texture->mipmaps > 1)
1263                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1264 }
1265
1266 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1267 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1268 {
1269         if (thread->texbound[command->unitnum])
1270                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1271         thread->texbound[command->unitnum] = command->texture;
1272 }
1273 void DPSOFTRAST_SetTexture(int unitnum, int index)
1274 {
1275         DPSOFTRAST_Command_SetTexture *command;
1276         DPSOFTRAST_Texture *texture;
1277         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1278         {
1279                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1280                 return;
1281         }
1282         texture = DPSOFTRAST_Texture_GetByIndex(index);
1283         if (index && !texture)
1284         {
1285                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1286                 return;
1287         }
1288
1289         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1290         command->unitnum = unitnum;
1291         command->texture = texture;
1292
1293         dpsoftrast.texbound[unitnum] = texture;
1294         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1295 }
1296
1297 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1298 {
1299         dpsoftrast.pointer_vertex3f = vertex3f;
1300         dpsoftrast.stride_vertex = stride;
1301 }
1302 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1303 {
1304         dpsoftrast.pointer_color4f = color4f;
1305         dpsoftrast.pointer_color4ub = NULL;
1306         dpsoftrast.stride_color = stride;
1307 }
1308 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1309 {
1310         dpsoftrast.pointer_color4f = NULL;
1311         dpsoftrast.pointer_color4ub = color4ub;
1312         dpsoftrast.stride_color = stride;
1313 }
1314 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1315 {
1316         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1317         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1318         dpsoftrast.stride_texcoord[unitnum] = stride;
1319 }
1320
1321 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1322 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1323 {
1324         thread->shader_mode = command->mode;
1325         thread->shader_permutation = command->permutation;
1326         thread->shader_exactspecularmath = command->exactspecularmath;
1327 }
1328 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1329 {
1330         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1331         command->mode = mode;
1332         command->permutation = permutation;
1333         command->exactspecularmath = exactspecularmath;
1334
1335         dpsoftrast.shader_mode = mode;
1336         dpsoftrast.shader_permutation = permutation;
1337         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1338 }
1339
1340 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1341 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1342 {
1343         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1344 }
1345 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1346 {
1347         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348         command->index = index;
1349         command->val[0] = v0;
1350         command->val[1] = v1;
1351         command->val[2] = v2;
1352         command->val[3] = v3;
1353
1354         dpsoftrast.uniform4f[index*4+0] = v0;
1355         dpsoftrast.uniform4f[index*4+1] = v1;
1356         dpsoftrast.uniform4f[index*4+2] = v2;
1357         dpsoftrast.uniform4f[index*4+3] = v3;
1358 }
1359 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1360 {
1361         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362         command->index = index;
1363         memcpy(command->val, v, sizeof(command->val));
1364
1365         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1366 }
1367
1368 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1369 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1370 {
1371         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1372 }
1373 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1374 {
1375 #ifdef SSE2_PRESENT
1376         int i, index;
1377         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1378         {
1379                 __m128 m0, m1, m2, m3;
1380                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1381                 command->index = (DPSOFTRAST_UNIFORM)index;
1382                 if (((size_t)v)&(ALIGN_SIZE-1))
1383                 {
1384                         m0 = _mm_loadu_ps(v);
1385                         m1 = _mm_loadu_ps(v+4);
1386                         m2 = _mm_loadu_ps(v+8);
1387                         m3 = _mm_loadu_ps(v+12);
1388                 }
1389                 else
1390                 {
1391                         m0 = _mm_load_ps(v);
1392                         m1 = _mm_load_ps(v+4);
1393                         m2 = _mm_load_ps(v+8);
1394                         m3 = _mm_load_ps(v+12);
1395                 }
1396                 if (transpose)
1397                 {
1398                         __m128 t0, t1, t2, t3;
1399                         t0 = _mm_unpacklo_ps(m0, m1);
1400                         t1 = _mm_unpacklo_ps(m2, m3);
1401                         t2 = _mm_unpackhi_ps(m0, m1);
1402                         t3 = _mm_unpackhi_ps(m2, m3);
1403                         m0 = _mm_movelh_ps(t0, t1);
1404                         m1 = _mm_movehl_ps(t1, t0);
1405                         m2 = _mm_movelh_ps(t2, t3);
1406                         m3 = _mm_movehl_ps(t3, t2);                     
1407                 }
1408                 _mm_store_ps(command->val, m0);
1409                 _mm_store_ps(command->val+4, m1);
1410                 _mm_store_ps(command->val+8, m2);
1411                 _mm_store_ps(command->val+12, m3);
1412                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1413                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1414                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1415                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1416         }
1417 #endif
1418 }
1419
1420 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1421 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1422 {
1423         thread->uniform1i[command->index] = command->val;
1424 }
1425 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1426 {
1427         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1428         command->index = index;
1429         command->val = i0;
1430
1431         dpsoftrast.uniform1i[command->index] = i0;
1432 }
1433
1434 #ifdef SSE2_PRESENT
1435 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1436 {
1437         float *end = dst + size*4;
1438         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1439         {
1440                 while (dst < end)
1441                 {
1442                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1443                         dst += 4;
1444                         src += stride;
1445                 }
1446         }
1447         else
1448         {
1449                 while (dst < end)
1450                 {
1451                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1452                         dst += 4;
1453                         src += stride;
1454                 }
1455         }
1456 }
1457
1458 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1459 {
1460         float *end = dst + size*4;
1461         if (stride == sizeof(float[3]))
1462         {
1463                 float *end4 = dst + (size&~3)*4;        
1464                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1465                 {
1466                         while (dst < end4)
1467                         {
1468                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1469                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1481                                 dst += 16;
1482                                 src += 4*sizeof(float[3]);
1483                         }
1484                 }
1485                 else
1486                 {
1487                         while (dst < end4)
1488                         {
1489                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1490                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1491                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1492                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1493                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1494                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1497                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1501                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1502                                 dst += 16;
1503                                 src += 4*sizeof(float[3]);
1504                         }
1505                 }
1506         }
1507         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_loadu_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520         else
1521         {
1522                 while (dst < end)
1523                 {
1524                         __m128 v = _mm_load_ps((const float *)src);
1525                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1526                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1527                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1528                         _mm_store_ps(dst, v);
1529                         dst += 4;
1530                         src += stride;
1531                 }
1532         }
1533 }
1534
1535 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1536 {
1537         float *end = dst + size*4;
1538         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1539         if (stride == sizeof(float[2]))
1540         {
1541                 float *end2 = dst + (size&~1)*4;
1542                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1543                 {
1544                         while (dst < end2)
1545                         {
1546                                 __m128 v = _mm_loadu_ps((const float *)src);
1547                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1548                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1549                                 dst += 8;
1550                                 src += 2*sizeof(float[2]);
1551                         }
1552                 }
1553                 else
1554                 {
1555                         while (dst < end2)
1556                         {
1557                                 __m128 v = _mm_load_ps((const float *)src);
1558                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1559                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1560                                 dst += 8;
1561                                 src += 2*sizeof(float[2]);
1562                         }
1563                 }
1564         }
1565         while (dst < end)
1566         {
1567                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1568                 dst += 4;
1569                 src += stride;
1570         }
1571 }
1572
1573 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 {
1575         float *end = dst + size*4;
1576         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1577         if (stride == sizeof(unsigned char[4]))
1578         {
1579                 float *end4 = dst + (size&~3)*4;
1580                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593                 else
1594                 {
1595                         while (dst < end4)
1596                         {
1597                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1598                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1599                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1600                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1601                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1602                                 dst += 16;
1603                                 src += 4*sizeof(unsigned char[4]);
1604                         }
1605                 }
1606         }
1607         while (dst < end)
1608         {
1609                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1610                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1611                 dst += 4;
1612                 src += stride;
1613         }
1614 }
1615
// Writes the four floats at src into each of 'size' consecutive four-float
// slots starting at dst.
//   dst  - destination, written with aligned 16-byte stores
//   src  - the four floats to replicate (read with an unaligned load)
//   size - number of slots to fill; nothing is written when size <= 0
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1626 #endif
1627
// Multiplies 'numitems' four-float vectors by a 4x4 matrix.
//   out4f       - destination, written with aligned 16-byte stores
//   in4f        - source vectors (may be the same buffer as out4f)
//   numitems    - number of vectors
//   inmatrix16f - 16 floats; each output is
//                 in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3,
//                 where rowN is floats 4N..4N+3 of the matrix
// A matrix that is bitwise equal to the identity takes a shortcut: the
// input is copied (or left alone when out4f == in4f).
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned input
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		// unaligned input
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1675
// Copies 'numitems' four-float vectors from in4f to out4f.
//   out4f    - destination buffer
//   in4f     - source buffer; may be identical to or overlap out4f
//   numitems - number of vectors; nothing is copied when it is <= 0
// A non-positive count is rejected before it can be converted to a huge
// unsigned byte count, and memmove is used so that overlapping buffers are
// handled correctly.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1680
1681 #ifdef SSE2_PRESENT
// Projects one four-float vertex 'in' = (x, y, z, w) into 'out'.
// The vertex is rearranged to (1, x, y, z), multiplied lane by lane with
// viewportscale, divided by w, offset by viewportcenter, and rotated so
// that lane 0 of that result ends up in lane 3 of 'out':
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN / sN are lanes of viewportcenter / viewportscale.
// 'out' must be an assignable __m128; 'in' is evaluated once.  The
// temporaries carry a dpsr_ prefix so they cannot capture caller names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 dpsr_pos = (in); \
	__m128 dpsr_w = _mm_shuffle_ps(dpsr_pos, dpsr_pos, _MM_SHUFFLE(3, 3, 3, 3)); \
	dpsr_pos = _mm_shuffle_ps(dpsr_pos, dpsr_pos, _MM_SHUFFLE(2, 1, 0, 3)); \
	dpsr_pos = _mm_move_ss(dpsr_pos, _mm_set_ss(1.0f)); \
	dpsr_pos = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, dpsr_pos), dpsr_w)); \
	out = _mm_shuffle_ps(dpsr_pos, dpsr_pos, _MM_SHUFFLE(0, 3, 2, 1)); \
}

// Same operation as DPSOFTRAST_PROJECTVERTEX (the two definitions were
// token-for-token identical), kept as a separate name for its users.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
1697
// Transforms one four-float vertex 'in' = (x, y, z, w) by four row vectors:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// 'out' must be an assignable __m128; 'in' is evaluated once.  The
// temporary carries a dpsr_ prefix so it cannot capture caller names.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 dpsr_vtx = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(dpsr_vtx, dpsr_vtx, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(dpsr_vtx, dpsr_vtx, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(dpsr_vtx, dpsr_vtx, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(dpsr_vtx, dpsr_vtx, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1706
// Computes a screen-space range for the projected second component
// (presumably Y, going by the function name) of a bounding box and reports
// which box corners were clipped.
//   starty, endy     - receive the truncated range: *starty from the largest
//                      projected value, *endy from the smallest, plus one
//   minpos, maxpos   - componentwise minimum / maximum of the vertices
//   viewportcenter,
//   viewportscale    - viewport vectors; lane 2 of each is the one applied
//   m0..m3           - the four rows used by DPSOFTRAST_TRANSFORMVERTEX
// Returns a bit mask with bit k set for every corner k whose transformed
// z + w is negative.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	int k, axis;
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);

	// swap lanes 0 and 1 of every row so that the second output component
	// is produced in lane 0, where the scalar (_ss) operations work
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	// the eight box corners: bit 0 of the index picks lane 0 from maxpos,
	// bit 1 picks lane 1, bit 2 picks lanes 2 and 3
	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;

	// transform every corner; corners with z + w >= 0 are unclipped and
	// contribute their lane 0 divided by w directly
	for (k = 0; k < 8; k++)
	{
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], corner[k], m0, m1, m2, m3);
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
		{
			__m128 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
			clipmask &= ~(1<<k);
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// for every clipped corner, follow the three box edges (index bits 1, 2
	// and 4) to unclipped neighbours and add the point where the edge
	// crosses z + w = 0
	for (k = 0; k < 8; k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (axis = 1; axis < 8; axis <<= 1)
		{
			const int other = k^axis;
			__m128 frac, proj;
			if (clipmask&(1<<other))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[other]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[other], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// bring lane 2 of the viewport vectors into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// limit the projected range to [-2, 2] before scaling
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1777         
1778 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1779 {
1780         float *end = out4f + numitems*4;
1781         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1782         __m128 minpos, maxpos;
1783         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1784         {
1785                 minpos = maxpos = _mm_loadu_ps(in4f);
1786                 while (out4f < end)
1787                 {
1788                         __m128 v = _mm_loadu_ps(in4f);
1789                         minpos = _mm_min_ps(minpos, v);
1790                         maxpos = _mm_max_ps(maxpos, v);
1791                         _mm_store_ps(out4f, v);
1792                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1793                         _mm_store_ps(screen4f, v);
1794                         in4f += 4;
1795                         out4f += 4;
1796                         screen4f += 4;
1797                 }
1798         }
1799         else
1800         {
1801                 minpos = maxpos = _mm_load_ps(in4f);
1802                 while (out4f < end)
1803                 {
1804                         __m128 v = _mm_load_ps(in4f);
1805                         minpos = _mm_min_ps(minpos, v);
1806                         maxpos = _mm_max_ps(maxpos, v);
1807                         _mm_store_ps(out4f, v);
1808                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1809                         _mm_store_ps(screen4f, v);
1810                         in4f += 4;
1811                         out4f += 4;
1812                         screen4f += 4;
1813                 }
1814         }
1815         if (starty && endy) 
1816                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1817                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1818                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1819                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1820                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1821         return 0;
1822 }
1823
1824 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1825 {
1826         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1827         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1828         float *end;
1829         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1830                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1831         end = out4f + numitems*4;
1832         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1833         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1834         m0 = _mm_loadu_ps(inmatrix16f);
1835         m1 = _mm_loadu_ps(inmatrix16f + 4);
1836         m2 = _mm_loadu_ps(inmatrix16f + 8);
1837         m3 = _mm_loadu_ps(inmatrix16f + 12);
1838         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1839         {
1840                 minpos = maxpos = _mm_loadu_ps(in4f);
1841                 while (out4f < end)
1842                 {
1843                         __m128 v = _mm_loadu_ps(in4f);
1844                         minpos = _mm_min_ps(minpos, v);
1845                         maxpos = _mm_max_ps(maxpos, v);
1846                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1847                         _mm_store_ps(out4f, v);
1848                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1849                         _mm_store_ps(screen4f, v);
1850                         in4f += 4;
1851                         out4f += 4;
1852                         screen4f += 4;
1853                 }
1854         }
1855         else
1856         {
1857                 minpos = maxpos = _mm_load_ps(in4f);
1858                 while (out4f < end)
1859                 {
1860                         __m128 v = _mm_load_ps(in4f);
1861                         minpos = _mm_min_ps(minpos, v);
1862                         maxpos = _mm_max_ps(maxpos, v);
1863                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1864                         _mm_store_ps(out4f, v);
1865                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1866                         _mm_store_ps(screen4f, v);
1867                         in4f += 4;
1868                         out4f += 4;
1869                         screen4f += 4;
1870                 }
1871         }
1872         if (starty && endy) 
1873                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1874         return 0;
1875 }
1876 #endif
1877
// Converts one client vertex array into four-float form.
//   outarray - index of the dpsoftrast.post_array4f buffer to fill
//   inarray  - DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or a value
//              treated as a texcoord array (unit = inarray -
//              DPSOFTRAST_ARRAY_TEXCOORD0)
// The range dpsoftrast.firstvertex .. firstvertex + numvertices - 1 is read.
// Colors come from the float pointer if set, else from the byte pointer,
// else every vertex gets dpsoftrast.color.  A texcoord array with a NULL
// pointer or a component count other than 2, 3 or 4 leaves the output
// buffer untouched.
// Returns the output buffer, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1936
1937 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1938 {
1939         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1940         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1941         return data;
1942 }
1943
#if 0
// (compiled out) Projects a vertex array without transforming it.
// When inarray >= 0 the array is first loaded into post_array4f[outarray];
// otherwise the current contents of that buffer are used.  Screen
// coordinates go to dpsoftrast.screencoord4f, and the clip mask and row
// range are stored in dpsoftrast.drawclipped / drawstarty / drawendy.
// Returns the buffer, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1956
// Transforms a vertex array in place by a 4x4 matrix and projects it.
// When inarray >= 0 the array is first loaded into post_array4f[outarray];
// otherwise the current contents of that buffer are used.  Screen
// coordinates go to dpsoftrast.screencoord4f, and the clip mask and row
// range are stored in dpsoftrast.drawclipped / drawstarty / drawendy.
// Returns the buffer, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1967
1968 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1969 {
1970         int x;
1971         int startx = span->startx;
1972         int endx = span->endx;
1973         float wslope = triangle->w[0];
1974         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1975         float endz = 1.0f / (w + wslope * startx);
1976         for (x = startx;x < endx;)
1977         {
1978                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1979                 float z = endz, dz;
1980                 if (nextsub >= endx) nextsub = endsub = endx-1;
1981                 endz = 1.0f / (w + wslope * nextsub);
1982                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1983                 for (; x <= endsub; x++, z += dz)
1984                         zf[x] = z;
1985         }
1986 }
1987
1988 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1989 {
1990         int x;
1991         int startx = span->startx;
1992         int endx = span->endx;
1993         int d[4];
1994         float a, b;
1995         unsigned char * RESTRICT pixelmask = span->pixelmask;
1996         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1997         if (!pixel)
1998                 return;
1999         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2000         // handle alphatest now (this affects depth writes too)
2001         if (thread->alphatest)
2002                 for (x = startx;x < endx;x++)
2003                         if (in4f[x*4+3] < 0.5f)
2004                                 pixelmask[x] = false;
2005         // FIXME: this does not handle bigendian
2006         switch(thread->fb_blendmode)
2007         {
2008         case DPSOFTRAST_BLENDMODE_OPAQUE:
2009                 for (x = startx;x < endx;x++)
2010                 {
2011                         if (!pixelmask[x])
2012                                 continue;
2013                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2014                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2015                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2016                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2017                         pixel[x*4+0] = d[0];
2018                         pixel[x*4+1] = d[1];
2019                         pixel[x*4+2] = d[2];
2020                         pixel[x*4+3] = d[3];
2021                 }
2022                 break;
2023         case DPSOFTRAST_BLENDMODE_ALPHA:
2024                 for (x = startx;x < endx;x++)
2025                 {
2026                         if (!pixelmask[x])
2027                                 continue;
2028                         a = in4f[x*4+3] * 255.0f;
2029                         b = 1.0f - in4f[x*4+3];
2030                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2031                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2032                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2033                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2034                         pixel[x*4+0] = d[0];
2035                         pixel[x*4+1] = d[1];
2036                         pixel[x*4+2] = d[2];
2037                         pixel[x*4+3] = d[3];
2038                 }
2039                 break;
2040         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2041                 for (x = startx;x < endx;x++)
2042                 {
2043                         if (!pixelmask[x])
2044                                 continue;
2045                         a = in4f[x*4+3] * 255.0f;
2046                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2047                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2048                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2049                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2050                         pixel[x*4+0] = d[0];
2051                         pixel[x*4+1] = d[1];
2052                         pixel[x*4+2] = d[2];
2053                         pixel[x*4+3] = d[3];
2054                 }
2055                 break;
2056         case DPSOFTRAST_BLENDMODE_ADD:
2057                 for (x = startx;x < endx;x++)
2058                 {
2059                         if (!pixelmask[x])
2060                                 continue;
2061                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2062                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2063                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2064                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2065                         pixel[x*4+0] = d[0];
2066                         pixel[x*4+1] = d[1];
2067                         pixel[x*4+2] = d[2];
2068                         pixel[x*4+3] = d[3];
2069                 }
2070                 break;
2071         case DPSOFTRAST_BLENDMODE_INVMOD:
2072                 for (x = startx;x < endx;x++)
2073                 {
2074                         if (!pixelmask[x])
2075                                 continue;
2076                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2077                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2078                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2079                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2080                         pixel[x*4+0] = d[0];
2081                         pixel[x*4+1] = d[1];
2082                         pixel[x*4+2] = d[2];
2083                         pixel[x*4+3] = d[3];
2084                 }
2085                 break;
2086         case DPSOFTRAST_BLENDMODE_MUL:
2087                 for (x = startx;x < endx;x++)
2088                 {
2089                         if (!pixelmask[x])
2090                                 continue;
2091                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2092                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2093                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2094                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2095                         pixel[x*4+0] = d[0];
2096                         pixel[x*4+1] = d[1];
2097                         pixel[x*4+2] = d[2];
2098                         pixel[x*4+3] = d[3];
2099                 }
2100                 break;
2101         case DPSOFTRAST_BLENDMODE_MUL2:
2102                 for (x = startx;x < endx;x++)
2103                 {
2104                         if (!pixelmask[x])
2105                                 continue;
2106                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2107                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2108                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2109                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2110                         pixel[x*4+0] = d[0];
2111                         pixel[x*4+1] = d[1];
2112                         pixel[x*4+2] = d[2];
2113                         pixel[x*4+3] = d[3];
2114                 }
2115                 break;
2116         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2117                 for (x = startx;x < endx;x++)
2118                 {
2119                         if (!pixelmask[x])
2120                                 continue;
2121                         a = in4f[x*4+3] * -255.0f;
2122                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2123                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2124                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2125                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2126                         pixel[x*4+0] = d[0];
2127                         pixel[x*4+1] = d[1];
2128                         pixel[x*4+2] = d[2];
2129                         pixel[x*4+3] = d[3];
2130                 }
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2133                 for (x = startx;x < endx;x++)
2134                 {
2135                         if (!pixelmask[x])
2136                                 continue;
2137                         a = 255.0f;
2138                         b = 1.0f - in4f[x*4+3];
2139                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2140                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2141                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2142                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2143                         pixel[x*4+0] = d[0];
2144                         pixel[x*4+1] = d[1];
2145                         pixel[x*4+2] = d[2];
2146                         pixel[x*4+3] = d[3];
2147                 }
2148                 break;
2149         case DPSOFTRAST_BLENDMODE_INVADD:
2150                 for (x = startx;x < endx;x++)
2151                 {
2152                         if (!pixelmask[x])
2153                                 continue;
2154                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2155                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2156                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2157                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2158                         pixel[x*4+0] = d[0];
2159                         pixel[x*4+1] = d[1];
2160                         pixel[x*4+2] = d[2];
2161                         pixel[x*4+3] = d[3];
2162                 }
2163                 break;
2164         }
2165 }
2166
2167 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2168 {
2169 #ifdef SSE2_PRESENT
2170         int x;
2171         int startx = span->startx;
2172         int endx = span->endx;
2173         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2174         unsigned char * RESTRICT pixelmask = span->pixelmask;
2175         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2176         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2177         if (!pixel)
2178                 return;
2179         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2180         pixeli += span->y * dpsoftrast.fb_width + span->x;
2181         // handle alphatest now (this affects depth writes too)
2182         if (thread->alphatest)
2183                 for (x = startx;x < endx;x++)
2184                         if (in4ub[x*4+3] < 0.5f)
2185                                 pixelmask[x] = false;
2186         // FIXME: this does not handle bigendian
2187         switch(thread->fb_blendmode)
2188         {
2189         case DPSOFTRAST_BLENDMODE_OPAQUE:
2190                 for (x = startx;x + 4 <= endx;)
2191                 {
2192                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2193                         {
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2195                                 x += 4;
2196                         }
2197                         else
2198                         {
2199                                 if (pixelmask[x])
2200                                         pixeli[x] = ini[x];
2201                                 x++;
2202                         }
2203                 }
2204                 for (;x < endx;x++)
2205                         if (pixelmask[x])
2206                                 pixeli[x] = ini[x];
2207                 break;
2208         case DPSOFTRAST_BLENDMODE_ALPHA:
2209         #define FINISHBLEND(blend2, blend1) \
2210                 for (x = startx;x + 1 < endx;x += 2) \
2211                 { \
2212                         __m128i src, dst; \
2213                         switch (*(const unsigned short*)&pixelmask[x]) \
2214                         { \
2215                         case 0x0101: \
2216                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2217                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2218                                 blend2; \
2219                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2220                                 continue; \
2221                         case 0x0100: \
2222                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2223                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2224                                 blend1; \
2225                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2226                                 continue; \
2227                         case 0x0001: \
2228                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2230                                 blend1; \
2231                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2232                                 continue; \
2233                         } \
2234                         break; \
2235                 } \
2236                 for(;x < endx; x++) \
2237                 { \
2238                         __m128i src, dst; \
2239                         if (!pixelmask[x]) \
2240                                 continue; \
2241                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2242                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2243                         blend1; \
2244                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2245                 }
2246
2247                 FINISHBLEND({
2248                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2249                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2250                 }, {
2251                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2252                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2253                 });
2254                 break;
2255         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2256                 FINISHBLEND({
2257                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2258                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2259                 }, {
2260                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2261                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2262                 });
2263                 break;
2264         case DPSOFTRAST_BLENDMODE_ADD:
2265                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2266                 break;
2267         case DPSOFTRAST_BLENDMODE_INVMOD:
2268                 FINISHBLEND({
2269                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2270                 }, {
2271                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2272                 });
2273                 break;
2274         case DPSOFTRAST_BLENDMODE_MUL:
2275                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2276                 break;
2277         case DPSOFTRAST_BLENDMODE_MUL2:
2278                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2279                 break;
2280         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2281                 FINISHBLEND({
2282                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2283                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2284                 }, {
2285                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2286                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2287                 });
2288                 break;
2289         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2290                 FINISHBLEND({
2291                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2292                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2293                 }, {
2294                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2295                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2296                 });
2297                 break;
2298         case DPSOFTRAST_BLENDMODE_INVADD:
2299                 FINISHBLEND({
2300                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2301                 }, {
2302                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2303                 });
2304                 break;
2305         }
2306 #endif
2307 }
2308
2309 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2310 {
2311         int x;
2312         int startx = span->startx;
2313         int endx = span->endx;
2314         int flags;
2315         float c[4];
2316         float data[4];
2317         float slope[4];
2318         float tc[2], endtc[2];
2319         float tcscale[2];
2320         unsigned int tci[2];
2321         unsigned int tci1[2];
2322         unsigned int tcimin[2];
2323         unsigned int tcimax[2];
2324         int tciwrapmask[2];
2325         int tciwidth;
2326         int filter;
2327         int mip;
2328         const unsigned char * RESTRICT pixelbase;
2329         const unsigned char * RESTRICT pixel[4];
2330         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2331         // if no texture is bound, just fill it with white
2332         if (!texture)
2333         {
2334                 for (x = startx;x < endx;x++)
2335                 {
2336                         out4f[x*4+0] = 1.0f;
2337                         out4f[x*4+1] = 1.0f;
2338                         out4f[x*4+2] = 1.0f;
2339                         out4f[x*4+3] = 1.0f;
2340                 }
2341                 return;
2342         }
2343         mip = triangle->mip[texunitindex];
2344         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2345         // if this mipmap of the texture is 1 pixel, just fill it with that color
2346         if (texture->mipmap[mip][1] == 4)
2347         {
2348                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2349                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2350                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2351                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2352                 for (x = startx;x < endx;x++)
2353                 {
2354                         out4f[x*4+0] = c[0];
2355                         out4f[x*4+1] = c[1];
2356                         out4f[x*4+2] = c[2];
2357                         out4f[x*4+3] = c[3];
2358                 }
2359                 return;
2360         }
2361         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2362         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2363         flags = texture->flags;
2364         tcscale[0] = texture->mipmap[mip][2];
2365         tcscale[1] = texture->mipmap[mip][3];
2366         tciwidth = texture->mipmap[mip][2];
2367         tcimin[0] = 0;
2368         tcimin[1] = 0;
2369         tcimax[0] = texture->mipmap[mip][2]-1;
2370         tcimax[1] = texture->mipmap[mip][3]-1;
2371         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2372         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2373         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2374         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2375         for (x = startx;x < endx;)
2376         {
2377                 unsigned int subtc[2];
2378                 unsigned int substep[2];
2379                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2380                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2381                 if (nextsub >= endx)
2382                 {
2383                         nextsub = endsub = endx-1;      
2384                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2385                 }
2386                 tc[0] = endtc[0];
2387                 tc[1] = endtc[1];
2388                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2389                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2390                 substep[0] = (endtc[0] - tc[0]) * subscale;
2391                 substep[1] = (endtc[1] - tc[1]) * subscale;
2392                 subtc[0] = tc[0] * (1<<16);
2393                 subtc[1] = tc[1] * (1<<16);
2394                 if (filter)
2395                 {
2396                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2397                         {
2398                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2399                                 {
2400                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2401                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2402                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2403                                         tci[0] = subtc[0]>>16;
2404                                         tci[1] = subtc[1]>>16;
2405                                         tci1[0] = tci[0] + 1;
2406                                         tci1[1] = tci[1] + 1;
2407                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2408                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2409                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2410                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2411                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2412                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2413                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2414                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2415                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2416                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2417                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2418                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2419                                         out4f[x*4+0] = c[0];
2420                                         out4f[x*4+1] = c[1];
2421                                         out4f[x*4+2] = c[2];
2422                                         out4f[x*4+3] = c[3];
2423                                 }
2424                         }
2425                         else
2426                         {
2427                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2428                                 {
2429                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2430                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2431                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2432                                         tci[0] = subtc[0]>>16;
2433                                         tci[1] = subtc[1]>>16;
2434                                         tci1[0] = tci[0] + 1;
2435                                         tci1[1] = tci[1] + 1;
2436                                         tci[0] &= tciwrapmask[0];
2437                                         tci[1] &= tciwrapmask[1];
2438                                         tci1[0] &= tciwrapmask[0];
2439                                         tci1[1] &= tciwrapmask[1];
2440                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2442                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2443                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2444                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2445                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2446                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2447                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2448                                         out4f[x*4+0] = c[0];
2449                                         out4f[x*4+1] = c[1];
2450                                         out4f[x*4+2] = c[2];
2451                                         out4f[x*4+3] = c[3];
2452                                 }
2453                         }
2454                 }
2455                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2456                 {
2457                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2458                         {
2459                                 tci[0] = subtc[0]>>16;
2460                                 tci[1] = subtc[1]>>16;
2461                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2462                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2463                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2464                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2465                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2466                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2467                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2468                                 out4f[x*4+0] = c[0];
2469                                 out4f[x*4+1] = c[1];
2470                                 out4f[x*4+2] = c[2];
2471                                 out4f[x*4+3] = c[3];
2472                         }
2473                 }
2474                 else
2475                 {
2476                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2477                         {
2478                                 tci[0] = subtc[0]>>16;
2479                                 tci[1] = subtc[1]>>16;
2480                                 tci[0] &= tciwrapmask[0];
2481                                 tci[1] &= tciwrapmask[1];
2482                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2483                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2484                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2485                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2486                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2487                                 out4f[x*4+0] = c[0];
2488                                 out4f[x*4+1] = c[1];
2489                                 out4f[x*4+2] = c[2];
2490                                 out4f[x*4+3] = c[3];
2491                         }
2492                 }
2493         }
2494 }
2495
2496 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2497 {
2498 #ifdef SSE2_PRESENT
2499         int x;
2500         int startx = span->startx;
2501         int endx = span->endx;
2502         int flags;
2503         __m128 data, slope, tcscale;
2504         __m128i tcsize, tcmask, tcoffset, tcmax;
2505         __m128 tc, endtc;
2506         __m128i subtc, substep, endsubtc;
2507         int filter;
2508         int mip;
2509         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2510         const unsigned char * RESTRICT pixelbase;
2511         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2512         // if no texture is bound, just fill it with white
2513         if (!texture)
2514         {
2515                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2516                 return;
2517         }
2518         mip = triangle->mip[texunitindex];
2519         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2520         // if this mipmap of the texture is 1 pixel, just fill it with that color
2521         if (texture->mipmap[mip][1] == 4)
2522         {
2523                 unsigned int k = *((const unsigned int *)pixelbase);
2524                 for (x = startx;x < endx;x++)
2525                         outi[x] = k;
2526                 return;
2527         }
2528         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2529         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2530         flags = texture->flags;
2531         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2532         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2533         tcscale = _mm_cvtepi32_ps(tcsize);
2534         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2535         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2536         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2537         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2538         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2539         tcmax = _mm_packs_epi32(tcmask, tcmask);
2540         for (x = startx;x < endx;)
2541         {
2542                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2543                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2544                 if (nextsub >= endx)
2545                 {
2546                         nextsub = endsub = endx-1;
2547                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2548                 }       
2549                 tc = endtc;
2550                 subtc = endsubtc;
2551                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2552                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2553                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2554                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2555                 substep = _mm_slli_epi32(substep, 1);
2556                 if (filter)
2557                 {
2558                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2559                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2560                         {
2561                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2562                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2563                                 {
2564                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2565                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2566                                         tci = _mm_madd_epi16(tci, tcoffset);
2567                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2568                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2569                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2570                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2571                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2572                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2573                                         fracm = _mm_srli_epi16(subtc, 1);
2574                                         pix1 = _mm_add_epi16(pix1,
2575                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2576                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2577                                         pix3 = _mm_add_epi16(pix3,
2578                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2579                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2580                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2581                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2582                                         pix2 = _mm_add_epi16(pix2,
2583                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2584                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2585                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2586                                 }
2587                                 if (x <= endsub)
2588                                 {
2589                                         const unsigned char * RESTRICT ptr1;
2590                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2591                                         tci = _mm_madd_epi16(tci, tcoffset);
2592                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2593                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2594                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2595                                         fracm = _mm_srli_epi16(subtc, 1);
2596                                         pix1 = _mm_add_epi16(pix1,
2597                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2598                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2599                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2600                                         pix1 = _mm_add_epi16(pix1,
2601                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2602                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2603                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2604                                         x++;
2605                                 }
2606                         }
2607                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2608                         {
2609                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2610                                 {
2611                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2612                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2613                                         tci = _mm_madd_epi16(tci, tcoffset);
2614                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2615                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2616                                                                                         _mm_setzero_si128());
2617                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2618                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2619                                                                                         _mm_setzero_si128());
2620                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2621                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2622                                         tci = _mm_madd_epi16(tci, tcoffset);
2623                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2624                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2625                                                                                         _mm_setzero_si128());
2626                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2627                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2628                                                                                         _mm_setzero_si128());
2629                                         fracm = _mm_srli_epi16(subtc, 1);
2630                                         pix1 = _mm_add_epi16(pix1,
2631                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633                                         pix3 = _mm_add_epi16(pix3,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638                                         pix2 = _mm_add_epi16(pix2,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2642                                 }
2643                                 if (x <= endsub)
2644                                 {
2645                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2646                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2647                                         tci = _mm_madd_epi16(tci, tcoffset);
2648                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2649                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2650                                                                                         _mm_setzero_si128());
2651                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2652                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2653                                                                                         _mm_setzero_si128());
2654                                         fracm = _mm_srli_epi16(subtc, 1);
2655                                         pix1 = _mm_add_epi16(pix1,
2656                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2658                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2659                                         pix1 = _mm_add_epi16(pix1,
2660                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2661                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2662                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2663                                         x++;
2664                                 }
2665                         }
2666                         else
2667                         {
2668                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2669                                 {
2670                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2671                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2672                                         tci = _mm_madd_epi16(tci, tcoffset);
2673                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2674                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2675                                                                                         _mm_setzero_si128());
2676                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2677                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2678                                                                                         _mm_setzero_si128());
2679                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2680                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2681                                         tci = _mm_madd_epi16(tci, tcoffset);
2682                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         fracm = _mm_srli_epi16(subtc, 1);
2689                                         pix1 = _mm_add_epi16(pix1,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2691                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2692                                         pix3 = _mm_add_epi16(pix3,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2694                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2695                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2696                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2697                                         pix2 = _mm_add_epi16(pix2,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2699                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2700                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2701                                 }
2702                                 if (x <= endsub)
2703                                 {
2704                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2705                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2706                                         tci = _mm_madd_epi16(tci, tcoffset);
2707                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2708                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2709                                                                                         _mm_setzero_si128());
2710                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2711                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2712                                                                                         _mm_setzero_si128());
2713                                         fracm = _mm_srli_epi16(subtc, 1);
2714                                         pix1 = _mm_add_epi16(pix1,
2715                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2716                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2717                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2718                                         pix1 = _mm_add_epi16(pix1,
2719                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2720                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2721                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2722                                         x++;
2723                                 }
2724                         }
2725                 }
2726                 else
2727                 {
2728                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2729                         {
2730                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2731                                 {
2732                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2733                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2734                                         tci = _mm_madd_epi16(tci, tcoffset);
2735                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2736                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2737                                 }
2738                                 if (x <= endsub)
2739                                 {
2740                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2741                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2742                                         tci = _mm_madd_epi16(tci, tcoffset);
2743                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2744                                         x++;
2745                                 }
2746                         }
2747                         else
2748                         {
2749                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2750                                 {
2751                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2752                                         tci = _mm_and_si128(tci, tcmax); 
2753                                         tci = _mm_madd_epi16(tci, tcoffset);
2754                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2755                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2756                                 }
2757                                 if (x <= endsub)
2758                                 {
2759                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2760                                         tci = _mm_and_si128(tci, tcmax); 
2761                                         tci = _mm_madd_epi16(tci, tcoffset);
2762                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2763                                         x++;
2764                                 }
2765                         }
2766                 }
2767         }
2768 #endif
2769 }
2770
2771 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2772 {
2773         // TODO: IMPLEMENT
2774         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2775 }
2776
// Shadow map sampling stub: the lookup is not implemented yet, so the
// input vector is ignored and the constant 1.0f is returned for every query.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float stubresult = 1.0f;
        (void)vector;
        return stubresult;
}
2782
2783 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2784 {
2785         int x;
2786         int startx = span->startx;
2787         int endx = span->endx;
2788         float c[4];
2789         float data[4];
2790         float slope[4];
2791         float z;
2792         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2793         for (x = startx;x < endx;x++)
2794         {
2795                 z = zf[x];
2796                 c[0] = (data[0] + slope[0]*x) * z;
2797                 c[1] = (data[1] + slope[1]*x) * z;
2798                 c[2] = (data[2] + slope[2]*x) * z;
2799                 c[3] = (data[3] + slope[3]*x) * z;
2800                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2801                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2802                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2803                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2804         }
2805 }
2806
2807 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2808 {
2809         int x;
2810         int startx = span->startx;
2811         int endx = span->endx;
2812         float c[4];
2813         float data[4];
2814         float slope[4];
2815         float z;
2816         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2817         for (x = startx;x < endx;x++)
2818         {
2819                 z = zf[x];
2820                 c[0] = (data[0] + slope[0]*x) * z;
2821                 c[1] = (data[1] + slope[1]*x) * z;
2822                 c[2] = (data[2] + slope[2]*x) * z;
2823                 c[3] = (data[3] + slope[3]*x) * z;
2824                 out4f[x*4+0] = c[0];
2825                 out4f[x*4+1] = c[1];
2826                 out4f[x*4+2] = c[2];
2827                 out4f[x*4+3] = c[3];
2828         }
2829 }
2830
2831 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2832 {
2833         int x, startx = span->startx, endx = span->endx;
2834         float c[4], localcolor[4];
2835         localcolor[0] = subcolor[0];
2836         localcolor[1] = subcolor[1];
2837         localcolor[2] = subcolor[2];
2838         localcolor[3] = subcolor[3];
2839         for (x = startx;x < endx;x++)
2840         {
2841                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2842                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2843                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2844                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2845                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2846                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2847                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2848                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2849         }
2850 }
2851
2852 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2853 {
2854         int x, startx = span->startx, endx = span->endx;
2855         for (x = startx;x < endx;x++)
2856         {
2857                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2858                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2859                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2860                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2861         }
2862 }
2863
2864 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2865 {
2866         int x, startx = span->startx, endx = span->endx;
2867         for (x = startx;x < endx;x++)
2868         {
2869                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2870                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2871                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2872                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2873         }
2874 }
2875
2876 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2877 {
2878         int x, startx = span->startx, endx = span->endx;
2879         float a, b;
2880         for (x = startx;x < endx;x++)
2881         {
2882                 a = 1.0f - inb4f[x*4+3];
2883                 b = inb4f[x*4+3];
2884                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2885                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2886                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2887                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2888         }
2889 }
2890
2891 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2892 {
2893         int x, startx = span->startx, endx = span->endx;
2894         float localcolor[4], ilerp, lerp;
2895         localcolor[0] = color[0];
2896         localcolor[1] = color[1];
2897         localcolor[2] = color[2];
2898         localcolor[3] = color[3];
2899         ilerp = 1.0f - localcolor[3];
2900         lerp = localcolor[3];
2901         for (x = startx;x < endx;x++)
2902         {
2903                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2904                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2905                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2906                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2907         }
2908 }
2909
2910
2911
2912 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2913 {
2914 #ifdef SSE2_PRESENT
2915         int x;
2916         int startx = span->startx;
2917         int endx = span->endx;
2918         __m128 data, slope;
2919         __m128 mod, endmod;
2920         __m128i submod, substep, endsubmod;
2921         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2922         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2923         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2924         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2925         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2926         for (x = startx; x < endx;)
2927         {
2928                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2929                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2930                 if (nextsub >= endx)
2931                 {
2932                         nextsub = endsub = endx-1;
2933                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2934                 }
2935                 mod = endmod;
2936                 submod = endsubmod;
2937                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2938                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2939                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2940                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2941                 substep = _mm_packs_epi32(substep, substep);
2942                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2943                 {
2944                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2945                         pix = _mm_mulhi_epu16(pix, submod);
2946                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2947                 }
2948                 if (x <= endsub)
2949                 {
2950                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2951                         pix = _mm_mulhi_epu16(pix, submod);
2952                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2953                         x++;
2954                 }
2955         }
2956 #endif
2957 }
2958
2959 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2960 {
2961 #ifdef SSE2_PRESENT
2962         int x;
2963         int startx = span->startx;
2964         int endx = span->endx;
2965         __m128 data, slope;
2966         __m128 mod, endmod;
2967         __m128i submod, substep, endsubmod;
2968         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2969         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2970         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2971         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2972         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2973         for (x = startx; x < endx;)
2974         {
2975                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2976                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2977                 if (nextsub >= endx)
2978                 {
2979                         nextsub = endsub = endx-1;
2980                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2981                 }
2982                 mod = endmod;
2983                 submod = endsubmod;
2984                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2985                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2986                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2987                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2988                 substep = _mm_packs_epi32(substep, substep);
2989                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2990                 {
2991                         __m128i pix = _mm_srai_epi16(submod, 4);
2992                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2993                 }
2994                 if (x <= endsub)
2995                 {
2996                         __m128i pix = _mm_srai_epi16(submod, 4);
2997                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2998                         x++;
2999                 }
3000         }
3001 #endif
3002 }
3003
3004 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3005 {
3006 #ifdef SSE2_PRESENT
3007         int x, startx = span->startx, endx = span->endx;
3008         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3009         localcolor = _mm_packs_epi32(localcolor, localcolor);
3010         for (x = startx;x+2 <= endx;x+=2)
3011         {
3012                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3013                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3014                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3015                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3016         }
3017         if (x < endx)
3018         {
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3021                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3022                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3023         }
3024 #endif
3025 }
3026
3027 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3028 {
3029 #ifdef SSE2_PRESENT
3030         int x, startx = span->startx, endx = span->endx;
3031         for (x = startx;x+2 <= endx;x+=2)
3032         {
3033                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3035                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3036                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3037         }
3038         if (x < endx)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3042                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3043                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3044         }
3045 #endif
3046 }
3047
3048 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3049 {
3050 #ifdef SSE2_PRESENT
3051         int x, startx = span->startx, endx = span->endx;
3052         for (x = startx;x+2 <= endx;x+=2)
3053         {
3054                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3055                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3056                 pix1 = _mm_add_epi16(pix1, pix2);
3057                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3058         }
3059         if (x < endx)
3060         {
3061                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3062                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3063                 pix1 = _mm_add_epi16(pix1, pix2);
3064                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3065         }
3066 #endif
3067 }
3068
3069 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3070 {
3071 #ifdef SSE2_PRESENT
3072         int x, startx = span->startx, endx = span->endx;
3073         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3074         tint = _mm_packs_epi32(tint, tint);
3075         for (x = startx;x+2 <= endx;x+=2)
3076         {
3077                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3079                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3080                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3081         }
3082         if (x < endx)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3086                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3087                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3088         }
3089 #endif
3090 }
3091
3092 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3093 {
3094 #ifdef SSE2_PRESENT
3095         int x, startx = span->startx, endx = span->endx;
3096         for (x = startx;x+2 <= endx;x+=2)
3097         {
3098                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3099                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3100                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3101                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3102                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3103         }
3104         if (x < endx)
3105         {
3106                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3107                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3108                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3109                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3110                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3111         }
3112 #endif
3113 }
3114
3115 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3116 {
3117 #ifdef SSE2_PRESENT
3118         int x, startx = span->startx, endx = span->endx;
3119         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3120         localcolor = _mm_packs_epi32(localcolor, localcolor);
3121         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3122         for (x = startx;x+2 <= endx;x+=2)
3123         {
3124                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3125                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3126                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3127         }
3128         if (x < endx)
3129         {
3130                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3131                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3132                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3133         }
3134 #endif
3135 }
3136
3137
3138
3139 void DPSOFTRAST_VertexShader_Generic(void)
3140 {
3141         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3142         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3143         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3144         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3145                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3146 }
3147
3148 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3149 {
3150         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3151         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3152         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3153         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3154         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3155         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3156         {
3157                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3158                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3159                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3160                 {
3161                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3162                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3163                         {
3164                                 // multiply
3165                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3166                         }
3167                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3168                         {
3169                                 // add
3170                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3171                         }
3172                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3173                         {
3174                                 // alphablend
3175                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3176                         }
3177                 }
3178         }
3179         else
3180                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3181         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3182 }
3183
3184
3185
3186 void DPSOFTRAST_VertexShader_PostProcess(void)
3187 {
3188         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3189         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3190         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3191 }
3192
3193 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3194 {
3195         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3196         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3197         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3198         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3199         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3200         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3201         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3202         {
3203                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3204                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3205         }
3206         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3207         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3208         {
3209                 // TODO: implement saturation
3210         }
3211         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3212         {
3213                 // TODO: implement gammaramps
3214         }
3215         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3216 }
3217
3218
3219
3220 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3221 {
3222         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3223 }
3224
3225 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3226 {
3227         // this is never called (because colormask is off when this shader is used)
3228         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3229         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3232         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3233 }
3234
3235
3236
3237 void DPSOFTRAST_VertexShader_FlatColor(void)
3238 {
3239         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3240         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3241 }
3242
3243 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3244 {
3245 #ifdef SSE2_PRESENT
3246         unsigned char * RESTRICT pixelmask = span->pixelmask;
3247         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3248         int x, startx = span->startx, endx = span->endx;
3249         __m128i Color_Ambientm;
3250         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3251         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3252         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3253         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3254         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3255         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3256                 pixel = buffer_FragColorbgra8;
3257         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3258         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3259         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3260         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3261         for (x = startx;x < endx;x++)
3262         {
3263                 __m128i color, pix;
3264                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3265                 {
3266                         __m128i pix2;
3267                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3268                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3269                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3270                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3271                         x += 3;
3272                         continue;
3273                 }
3274                 if (!pixelmask[x])
3275                         continue;
3276                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3277                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3278                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3279         }
3280         if (pixel == buffer_FragColorbgra8)
3281                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3282 #endif
3283 }
3284
3285
3286
3287 void DPSOFTRAST_VertexShader_VertexColor(void)
3288 {
3289         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3290         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3291         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3292 }
3293
3294 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3295 {
3296 #ifdef SSE2_PRESENT
3297         unsigned char * RESTRICT pixelmask = span->pixelmask;
3298         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3299         int x, startx = span->startx, endx = span->endx;
3300         __m128i Color_Ambientm, Color_Diffusem;
3301         __m128 data, slope;
3302         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3303         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3304         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3305         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3306         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3307         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3308         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3309                 pixel = buffer_FragColorbgra8;
3310         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3311         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3312         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3313         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3314         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3315         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3316         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3317         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3318         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3319         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3320         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3321         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3322         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3323         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3324         {
3325                 __m128i color, mod, pix;
3326                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3327                 {
3328                         __m128i pix2, mod2;
3329                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3330                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3331                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3332                         data = _mm_add_ps(data, slope);
3333                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3334                         data = _mm_add_ps(data, slope);
3335                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3336                         data = _mm_add_ps(data, slope);
3337                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3338                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3339                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3340                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3341                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3342                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3343                         x += 3;
3344                         continue;
3345                 }
3346                 if (!pixelmask[x])
3347                         continue;
3348                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3349                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3350                 mod = _mm_packs_epi32(mod, mod);
3351                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3352                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3353         }
3354         if (pixel == buffer_FragColorbgra8)
3355                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3356 #endif
3357 }
3358
3359
3360
3361 void DPSOFTRAST_VertexShader_Lightmap(void)
3362 {
3363         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3364         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3365         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3366 }
3367
3368 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3369 {
3370 #ifdef SSE2_PRESENT
3371         unsigned char * RESTRICT pixelmask = span->pixelmask;
3372         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3373         int x, startx = span->startx, endx = span->endx;
3374         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3375         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3376         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3377         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3381         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3382         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3383         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3384                 pixel = buffer_FragColorbgra8;
3385         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3386         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3387         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3388         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3389         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3390         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3391         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3392         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3393         {
3394                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3395                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3396                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3397                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3398                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3399                 for (x = startx;x < endx;x++)
3400                 {
3401                         __m128i color, lightmap, glow, pix;
3402                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3403                         {
3404                                 __m128i pix2;
3405                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3406                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3407                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3408                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3409                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3410                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3411                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3412                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3413                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3414                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3415                                 x += 3;
3416                                 continue;
3417                         }
3418                         if (!pixelmask[x])
3419                                 continue;
3420                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3421                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3422                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3423                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3424                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3425                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3426                 }
3427         }
3428         else
3429         {
3430                 for (x = startx;x < endx;x++)
3431                 {
3432                         __m128i color, lightmap, pix;
3433                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3434                         {
3435                                 __m128i pix2;
3436                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3437                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3438                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3439                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3440                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3441                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3442                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3443                                 x += 3;
3444                                 continue;
3445                         }
3446                         if (!pixelmask[x]) 
3447                                 continue;
3448                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3449                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3450                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3451                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3452                 }
3453         }
3454         if (pixel == buffer_FragColorbgra8)
3455                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3456 #endif
3457 }
3458
3459
// forward declarations of the shared light direction shaders
// the FakeLight and LightDirectionMap entry points that follow call these two functions, whose definitions appear further down in this file
3460 void DPSOFTRAST_VertexShader_LightDirection(void);
3461 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3462
// FakeLight vertex shader
// does no work of its own: it forwards directly to DPSOFTRAST_VertexShader_LightDirection
3463 void DPSOFTRAST_VertexShader_FakeLight(void)
3464 {
3465         DPSOFTRAST_VertexShader_LightDirection();
3466 }
3467
// FakeLight pixel shader
// passes thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which contains its own branches for thread->shader_mode == SHADERMODE_FAKELIGHT
3468 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3469 {
3470         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3471 }
3472
3473
3474
// LightDirectionMap (model space) vertex shader
// runs the shared DPSOFTRAST_VertexShader_LightDirection and then additionally loads TEXCOORD4,
// the texcoord array the shared pixel shader uses when sampling GL20TU_LIGHTMAP and GL20TU_DELUXEMAP
3475 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3476 {
3477         DPSOFTRAST_VertexShader_LightDirection();
         // source and destination are the same array index
         // NOTE(review): presumably this copies the input TEXCOORD4 data through unmodified - confirm against DPSOFTRAST_Array_Load
3478         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3479 }
3480
// LightDirectionMap (model space) pixel shader
// passes thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which contains its own branches for thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE
3481 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3482 {
3483         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3484 }
3485
3486
3487
// LightDirectionMap (tangent space) vertex shader
// same two steps as the model space variant: the shared DPSOFTRAST_VertexShader_LightDirection followed by a load of TEXCOORD4,
// the texcoord array the shared pixel shader uses when sampling GL20TU_LIGHTMAP and GL20TU_DELUXEMAP
3488 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3489 {
3490         DPSOFTRAST_VertexShader_LightDirection();
         // source and destination are the same array index
         // NOTE(review): presumably this copies the input TEXCOORD4 data through unmodified - confirm against DPSOFTRAST_Array_Load
3491         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3492 }
3493
// LightDirectionMap (tangent space) pixel shader
// passes thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which contains its own branches for thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE
3494 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3495 {
3496         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3497 }
3498
3499
3500
3501 void DPSOFTRAST_VertexShader_LightDirection(void)
3502 {
3503         int i;
3504         int numvertices = dpsoftrast.numvertices;
3505         float LightDir[4];
3506         float LightVector[4];
3507         float EyePosition[4];
3508         float EyeVectorModelSpace[4];
3509         float EyeVector[4];
3510         float position[4];
3511         float svector[4];
3512         float tvector[4];
3513         float normal[4];
3514         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3515         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3516         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3517         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3518         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3519         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3520         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3521         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3522         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3523         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3524         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3525         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3526         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3527         for (i = 0;i < numvertices;i++)
3528         {
3529                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3530                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3531                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3532                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3533                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3534                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3535                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3536                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3537                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3538                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3539                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3540                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3541                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3542                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3543                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3545                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3547                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3548                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3549                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3550                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3551                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3552                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3553                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3554                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3555                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3556                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3557                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3558         }
3559         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3560 }
3561
3562 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3563 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3564 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3565 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3566 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3567 #define DPSOFTRAST_Vector3Normalize(v)\
3568 do\
3569 {\
3570         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3571         if (len)\
3572         {\
3573                 len = 1.0f / len;\
3574                 v[0] *= len;\
3575                 v[1] *= len;\
3576                 v[2] *= len;\
3577         }\
3578 }\
3579 while(0)
3580
3581 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3582 {
3583         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3584         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3586         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3587         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3588         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3589         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3590         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3591         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3592         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3593         int x, startx = span->startx, endx = span->endx;
3594         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3595         float LightVectordata[4];
3596         float LightVectorslope[4];
3597         float EyeVectordata[4];
3598         float EyeVectorslope[4];
3599         float VectorSdata[4];
3600         float VectorSslope[4];
3601         float VectorTdata[4];
3602         float VectorTslope[4];
3603         float VectorRdata[4];
3604         float VectorRslope[4];
3605         float z;
3606         float diffusetex[4];
3607         float glosstex[4];
3608         float surfacenormal[4];
3609         float lightnormal[4];
3610         float lightnormal_modelspace[4];
3611         float eyenormal[4];
3612         float specularnormal[4];
3613         float diffuse;
3614         float specular;
3615         float SpecularPower;
3616         int d[4];
3617         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3618         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3619         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3620         Color_Glow[3] = 0.0f;
3621         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3622         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3623         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3624         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3625         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3626         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3627         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3628         Color_Pants[3] = 0.0f;
3629         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3630         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3631         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3632         Color_Shirt[3] = 0.0f;
3633         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3634         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3635         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3636         {
3637                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3638                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3639         }
3640         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3641         {
3642                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3643         }
3644         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3645         {
3646                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3647                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3648                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3649                 Color_Diffuse[3] = 0.0f;
3650                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3651                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3652                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3653                 LightColor[3] = 0.0f;
3654                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3655                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3656                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3657                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3658                 Color_Specular[3] = 0.0f;
3659                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3660                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3661                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3662
3663                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3664                 {
3665                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3666                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3667                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3668                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3669                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3670                 }
3671                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3672                 {
3673                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3674                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3675                 }
3676                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3677                 {
3678                         // nothing of this needed
3679                 }
3680                 else
3681                 {
3682                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3683                 }
3684
3685                 for (x = startx;x < endx;x++)
3686                 {
3687                         z = buffer_z[x];
3688                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3689                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3690                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3691                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3692                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3693                         {
3694                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3695                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3696                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3697                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3698                         }
3699                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3700                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3701                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3702                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3703                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3704                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3705                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3706                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3707
3708                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3709                         {
3710                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3711                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3712                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3713                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3714
3715                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3716                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3717                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3718                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3719
3720                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3721                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3722                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3723                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3724
3725                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3726                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3727                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3728                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3729
3730                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3731                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3732
3733                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3734                                 {
3735                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3736                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3737                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3738                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3739                                 }
3740                         }
3741                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3742                         {
3743                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3744                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3745                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3746                                 {
3747                                         float f = 1.0f / 256.0f;
3748                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3749                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3750                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3751                                 }
3752                         }
3753                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3754                         {
3755                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3756                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3757                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3758                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3759
3760                                 LightColor[0] = 1.0;
3761                                 LightColor[1] = 1.0;
3762                                 LightColor[2] = 1.0;
3763                         }
3764                         else
3765                         {
3766                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3767                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3768                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3769                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3770                         }
3771
3772                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3773
3774                         if(thread->shader_exactspecularmath)
3775                         {
3776                                 // reflect lightnormal at surfacenormal, take the negative of that
3777                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3778                                 float f;
3779                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3780                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3781                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3782                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3783
3784                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3785                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3786                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3787                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3788                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3789
3790                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3791                         }
3792                         else
3793                         {
3794                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3795                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3796                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3797                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3798
3799                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3800                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3801                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3802                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3803
3804                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3805                         }
3806
3807                         specular = pow(specular, SpecularPower * glosstex[3]);
3808                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3809                         {
3810                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3811                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3812                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3813                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3814                         }
3815                         else
3816                         {
3817                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3818                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3819                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3820                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3821                         }
3822
3823                         buffer_FragColorbgra8[x*4+0] = d[0];
3824                         buffer_FragColorbgra8[x*4+1] = d[1];
3825                         buffer_FragColorbgra8[x*4+2] = d[2];
3826                         buffer_FragColorbgra8[x*4+3] = d[3];
3827                 }
3828         }
3829         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3830         {
3831                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3832                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3833                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3834                 Color_Diffuse[3] = 0.0f;
3835                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3836                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3837                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3838                 LightColor[3] = 0.0f;
3839                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3840
3841                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3842                 {
3843                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3844                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3845                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3846                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3847                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3848                 }
3849                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3850                 {
3851                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3852                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3853                 }
3854                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3855                 {
3856                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3857                 }
3858                 else
3859                 {
3860                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3861                 }
3862
3863                 for (x = startx;x < endx;x++)
3864                 {
3865                         z = buffer_z[x];
3866                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3867                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3868                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3869                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3870                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3871                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3872                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3873                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3874
3875                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3876                         {
3877                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3878                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3879                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3880                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3881
3882                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3883                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3884                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3885                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3886
3887                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3888                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3889                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3890                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3891
3892                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3893                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3894                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3895                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3896
3897                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3898                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3899
3900                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3901                                 {
3902                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3903                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3904                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3905                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3906                                 }
3907                         }
3908                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3909                         {
3910                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3911                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3912                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3913                                 {
3914                                         float f = 1.0f / 256.0f;
3915                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3916                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3917                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3918                                 }
3919                         }
3920                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3921                         {
3922                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3923                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3924                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3925                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3926
3927                                 LightColor[0] = 1.0;
3928                                 LightColor[1] = 1.0;
3929                                 LightColor[2] = 1.0;
3930                         }
3931                         else
3932                         {
3933                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3934                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3935                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3936                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3937                         }
3938
3939                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3940                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3941                         {
3942                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3943                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3944                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3945                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3946                         }
3947                         else
3948                         {
3949                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3950                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3951                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3952                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3953                         }
3954                         buffer_FragColorbgra8[x*4+0] = d[0];
3955                         buffer_FragColorbgra8[x*4+1] = d[1];
3956                         buffer_FragColorbgra8[x*4+2] = d[2];
3957                         buffer_FragColorbgra8[x*4+3] = d[3];
3958                 }
3959         }
3960         else
3961         {
3962                 for (x = startx;x < endx;x++)
3963                 {
3964                         z = buffer_z[x];
3965                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3966                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3967                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3968                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3969
3970                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3971                         {
3972                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3973                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3974                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3975                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3976                         }
3977                         else
3978                         {
3979                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3980                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3981                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3982                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3983                         }
3984                         buffer_FragColorbgra8[x*4+0] = d[0];
3985                         buffer_FragColorbgra8[x*4+1] = d[1];
3986                         buffer_FragColorbgra8[x*4+2] = d[2];
3987                         buffer_FragColorbgra8[x*4+3] = d[3];
3988                 }
3989         }
3990         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3991 }
3992
3993
3994
3995 void DPSOFTRAST_VertexShader_LightSource(void)
3996 {
3997         int i;
3998         int numvertices = dpsoftrast.numvertices;
3999         float LightPosition[4];
4000         float LightVector[4];
4001         float LightVectorModelSpace[4];
4002         float EyePosition[4];
4003         float EyeVectorModelSpace[4];
4004         float EyeVector[4];
4005         float position[4];
4006         float svector[4];
4007         float tvector[4];
4008         float normal[4];
4009         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4010         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4011         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4012         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4013         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4014         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4015         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4016         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4017         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4018         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4019         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4020         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4021         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4022         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4023         for (i = 0;i < numvertices;i++)
4024         {
4025                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4026                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4027                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4028                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4029                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4030                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4031                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4032                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4033                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4034                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4035                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4036                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4037                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4038                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4039                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4040                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4041                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4042                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4043                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4044                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4045                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4046                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4047                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4048                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4049                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4050                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4051                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4052                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4053                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4054                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4055                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4056                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4057         }
4058         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4059         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4060 }
4061
4062 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4063 {
4064 #ifdef SSE2_PRESENT
4065         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4066         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4067         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4068         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4069         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4070         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4071         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4072         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4073         int x, startx = span->startx, endx = span->endx;
4074         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4075         float CubeVectordata[4];
4076         float CubeVectorslope[4];
4077         float LightVectordata[4];
4078         float LightVectorslope[4];
4079         float EyeVectordata[4];
4080         float EyeVectorslope[4];
4081         float z;
4082         float diffusetex[4];
4083         float glosstex[4];
4084         float surfacenormal[4];
4085         float lightnormal[4];
4086         float eyenormal[4];
4087         float specularnormal[4];
4088         float diffuse;
4089         float specular;
4090         float SpecularPower;
4091         float CubeVector[4];
4092         float attenuation;
4093         int d[4];
4094         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4095         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4096         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4097         Color_Glow[3] = 0.0f;
4098         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4099         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4100         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4101         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4102         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4103         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4104         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4105         Color_Diffuse[3] = 0.0f;
4106         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4107         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4108         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4109         Color_Specular[3] = 0.0f;
4110         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4111         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4112         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4113         Color_Pants[3] = 0.0f;
4114         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4115         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4116         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4117         Color_Shirt[3] = 0.0f;
4118         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4119         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4120         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4121         LightColor[3] = 0.0f;
4122         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4123         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4124         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4125         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4126         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4127         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4128         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4129         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4130         {
4131                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4132                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4133         }
4134         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4135                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4136         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4137         {
4138                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4139                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4140                 for (x = startx;x < endx;x++)
4141                 {
4142                         z = buffer_z[x];
4143                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4144                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4145                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4146                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4147                         if (attenuation < 0.01f)
4148                                 continue;
4149                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4150                         {
4151                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4152                                 if (attenuation < 0.01f)
4153                                         continue;
4154                         }
4155
4156                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4157                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4158                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4159                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4160                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4161                         {
4162                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4163                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4164                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4165                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4166                         }
4167                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4168                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4169                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4170                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4171                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4172                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4173                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4174                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4175
4176                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4177                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4178                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4179                         DPSOFTRAST_Vector3Normalize(lightnormal);
4180
4181                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4182
4183                         if(thread->shader_exactspecularmath)
4184                         {
4185                                 // reflect lightnormal at surfacenormal, take the negative of that
4186                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4187                                 float f;
4188                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4189                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4190                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4191                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4192
4193                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4194                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4195                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4196                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4197                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4198
4199                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4200                         }
4201                         else
4202                         {
4203                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4204                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4205                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4206                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4207
4208                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4209                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4210                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4211                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4212
4213                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4214                         }
4215                         specular = pow(specular, SpecularPower * glosstex[3]);
4216
4217                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4218                         {
4219                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4220                                 attenuation *= (1.0f / 255.0f);
4221                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4222                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4223                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4224                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4225                         }
4226                         else
4227                         {
4228                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4229                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4230                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4231                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4232                         }
4233                         buffer_FragColorbgra8[x*4+0] = d[0];
4234                         buffer_FragColorbgra8[x*4+1] = d[1];
4235                         buffer_FragColorbgra8[x*4+2] = d[2];
4236                         buffer_FragColorbgra8[x*4+3] = d[3];
4237                 }
4238         }
4239         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4240         {
4241                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4242                 for (x = startx;x < endx;x++)
4243                 {
4244                         z = buffer_z[x];
4245                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4246                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4247                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4248                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4249                         if (attenuation < 0.01f)
4250                                 continue;
4251                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4252                         {
4253                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4254                                 if (attenuation < 0.01f)
4255                                         continue;
4256                         }
4257
4258                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4259                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4260                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4261                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4262                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4263                         {
4264                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4265                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4266                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4267                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4268                         }
4269                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4270                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4271                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4272                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4273
4274                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4275                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4276                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4277                         DPSOFTRAST_Vector3Normalize(lightnormal);
4278
4279                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4280                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4281                         {
4282                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4283                                 attenuation *= (1.0f / 255.0f);
4284                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4285                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4286                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4287                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4288                         }
4289                         else
4290                         {
4291                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4292                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4293                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4294                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4295                         }
4296                         buffer_FragColorbgra8[x*4+0] = d[0];
4297                         buffer_FragColorbgra8[x*4+1] = d[1];
4298                         buffer_FragColorbgra8[x*4+2] = d[2];
4299                         buffer_FragColorbgra8[x*4+3] = d[3];
4300                 }
4301         }
4302         else
4303         {
4304                 for (x = startx;x < endx;x++)
4305                 {
4306                         z = buffer_z[x];
4307                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4308                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4309                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4310                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4311                         if (attenuation < 0.01f)
4312                                 continue;
4313                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4314                         {
4315                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4316                                 if (attenuation < 0.01f)
4317                                         continue;
4318                         }
4319
4320                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4321                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4322                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4323                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4324                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4325                         {
4326                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4327                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4328                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4329                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4330                         }
4331                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4332                         {
4333                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4334                                 attenuation *= (1.0f / 255.0f);
4335                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4336                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4337                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4338                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4339                         }
4340                         else
4341                         {
4342                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4343                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4344                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4345                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4346                         }
4347                         buffer_FragColorbgra8[x*4+0] = d[0];
4348                         buffer_FragColorbgra8[x*4+1] = d[1];
4349                         buffer_FragColorbgra8[x*4+2] = d[2];
4350                         buffer_FragColorbgra8[x*4+3] = d[3];
4351                 }
4352         }
4353         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4354 #endif
4355 }
4356
4357
4358
4359 void DPSOFTRAST_VertexShader_Refraction(void)
4360 {
4361         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4362         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4363         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4364 }
4365
4366 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4367 {
4368         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4369
4370         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4371         float z;
4372         int x, startx = span->startx, endx = span->endx;
4373
4374         // texture reads
4375         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4376         //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4377         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4378
4379         // varyings
4380         float ModelViewProjectionPositiondata[4];
4381         float ModelViewProjectionPositionslope[4];
4382
4383         // uniforms
4384         float ScreenScaleRefractReflect[2];
4385         float ScreenCenterRefractReflect[2];
4386         float DistortScaleRefractReflect[2];
4387         float RefractColor[4];
4388
4389         const unsigned char * RESTRICT pixelbase;
4390         const unsigned char * RESTRICT pixel[4];
4391         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4392         if(!texture) return;
4393         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4394
4395         // read textures
4396         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4397         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4398         //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4399
4400         // read varyings
4401         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4402
4403         // read uniforms
4404         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4405         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4406         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4407         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4408         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4409         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4410         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4411         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4412         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4413         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4414
4415         // do stuff
4416         for (x = startx;x < endx;x++)
4417         {
4418                 float SafeScreenTexCoord[2];
4419                 float ScreenTexCoord[2];
4420                 float v[3];
4421                 float iw;
4422                 unsigned char c[4];
4423
4424                 z = buffer_z[x];
4425
4426                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4427                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4428         
4429                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4430                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4431                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4432
4433                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4434                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4435                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4436                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4437                 DPSOFTRAST_Vector3Normalize(v);
4438                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4439                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4440
4441                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4442                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4443                 {
4444                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
4445                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4446                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4447                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4448                         int tci[2] = { tc[0]>>16, tc[1]>>16 };
4449                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4450                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4451                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4452                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4453                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4454                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4455                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4456                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4457                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4458                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4459                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4460                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4461                 }
4462                 else
4463                 {
4464                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4465                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4466                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4467                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4468                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4469                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4470                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4471                         c[0] = pixel[0][0];
4472                         c[1] = pixel[0][1];
4473                         c[2] = pixel[0][2];
4474                 }
4475
4476                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4477                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4478                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4479                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4480                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4481         }
4482
4483         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4484 }
4485
4486
4487
4488 void DPSOFTRAST_VertexShader_Water(void)
4489 {
4490         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4491 }
4492
4493
4494 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4495 {
4496         // TODO: IMPLEMENT
4497         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4498         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4499         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4500         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4501         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4502 }
4503
4504
4505
4506 void DPSOFTRAST_VertexShader_ShowDepth(void)
4507 {
4508         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4509 }
4510
4511 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4512 {
4513         // TODO: IMPLEMENT
4514         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4515         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4516         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4517         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4518         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4519 }
4520
4521
4522
4523 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4524 {
4525         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4526 }
4527
4528 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4529 {
4530         // TODO: IMPLEMENT
4531         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4532         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4533         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4534         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4535         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4536 }
4537
4538
4539
4540 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4541 {
4542         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4543 }
4544
4545 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4546 {
4547         // TODO: IMPLEMENT
4548         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4549         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4550         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4551         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4552         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4553 }
4554
4555
4556
4557 typedef struct DPSOFTRAST_ShaderModeInfo_s
4558 {
4559         int lodarrayindex;
4560         void (*Vertex)(void);
4561         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4562         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4563         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4564 }
4565 DPSOFTRAST_ShaderModeInfo;
4566
4567 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4568 {
4569         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4570         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4571         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4572         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4573         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4574         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4575         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4576         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4577         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4578         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4579         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4580         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4581         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4582         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4583         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4584         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4585 };
4586
4587 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4588 {
4589         int i;
4590         int x;
4591         int startx;
4592         int endx;
4593 //      unsigned int c;
4594 //      unsigned int *colorpixel;
4595         unsigned int *depthpixel;
4596         float w;
4597         float wslope;
4598         int depth;
4599         int depthslope;
4600         unsigned int d;
4601         DPSOFTRAST_State_Triangle *triangle;
4602         DPSOFTRAST_State_Span *span;
4603         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4604         for (i = 0; i < thread->numspans; i++)
4605         {
4606                 span = &thread->spans[i];
4607                 triangle = &thread->triangles[span->triangle];
4608                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4609                 {
4610                         wslope = triangle->w[0];
4611                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4612                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4613                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4614                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4615                         startx = span->startx;
4616                         endx = span->endx;
4617                         switch(thread->fb_depthfunc)
4618                         {
4619                         default:
4620                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4621                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4622                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4623                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4624                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4625                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4626                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4627                         }
4628                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4629                         //for (x = startx;x < endx;x++)
4630                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4631                         // if there is no color buffer, skip pixel shader
4632                         while (startx < endx && !pixelmask[startx])
4633                                 startx++;
4634                         while (endx > startx && !pixelmask[endx-1])
4635                                 endx--;
4636                         if (startx >= endx)
4637                                 continue; // no pixels to fill
4638                         span->pixelmask = pixelmask;
4639                         span->startx = startx;
4640                         span->endx = endx;
4641                         // run pixel shader if appropriate
4642                         // do this before running depthmask code, to allow the pixelshader
4643                         // to clear pixelmask values for alpha testing
4644                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4645                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4646                         if (thread->depthmask)
4647                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4648                                         if (pixelmask[x])
4649                                                 depthpixel[x] = d;
4650                 }
4651                 else
4652                 {
4653                         // no depth testing means we're just dealing with color...
4654                         // if there is no color buffer, skip pixel shader
4655                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4656                         {
4657                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4658                                 span->pixelmask = pixelmask;
4659                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4660                         }
4661                 }
4662         }
4663         thread->numspans = 0;
4664 }
4665
// Draw command payload, as used by the code that follows:
//   starty/endy         - row range of the draw (DPSOFTRAST_DrawTriangles clamps dpsoftrast.drawstarty/drawendy to the framebuffer height)
//   refcount            - set to dpsoftrast.numthreads by DPSOFTRAST_DrawTriangles; each thread decrements it once when it is done with the command
//   clipped             - non-zero when triangles have to be tested against the near plane
//   firstvertex         - subtracted from every element index to get an index into arrays
//   numvertices         - number of float[4] entries in each array
//   arrays              - screen coordinates first, then the transformed position array, then one array per shader input
//   element3i/element3s - copies of the index list (both NULL for a non-indexed draw)
4666 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4667
// Executes one Draw command on behalf of one thread.
// For every triangle of the command this clips against the near plane (when
// command->clipped is set), culls by facing, rejects triangles outside the
// scissor rectangle or outside the thread's two row bands, computes the
// interpolation planes and mip levels, and emits spans into thread->spans,
// which are flushed through DPSOFTRAST_Draw_ProcessSpans.
// The thread drops its reference on the command when it is finished (or when it
// finds nothing to do); the thread that drops the last reference frees
// command->arrays if that buffer was allocated separately from the command.
// The whole body is compiled only when SSE2_PRESENT is defined.
4668 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4669 {
4670 #ifdef SSE2_PRESENT
4671         int cullface = thread->cullface;
4672         int minx, maxx, miny, maxy;
4673         int miny1, maxy1, miny2, maxy2;
4674         __m128i fbmin, fbmax;
4675         __m128 viewportcenter, viewportscale;
4676         int firstvertex = command->firstvertex;
4677         int numvertices = command->numvertices;
4678         int numtriangles = command->numtriangles;
4679         const int *element3i = command->element3i;
4680         const unsigned short *element3s = command->element3s;
4681         int clipped = command->clipped;
4682         int i;
4683         int j;
4684         int k;
4685         int y;
4686         int e[3];
4687         __m128i screeny;
4688         int starty, endy, bandy;
4689         int numpoints;
4690         int clipcase;
4691         float clipdist[4];
4692         __m128 triangleedge1, triangleedge2, trianglenormal;
4693         __m128 clipfrac[3];
4694         __m128 screen[4];
4695         DPSOFTRAST_State_Triangle *triangle;
4696         DPSOFTRAST_Texture *texture;
4697         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp the thread's two row bands (miny1..maxy1 and miny2..maxy2) to the scissor rows
4698         miny = thread->fb_scissor[1];
4699         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4700         miny1 = bound(miny, thread->miny1, maxy);
4701         maxy1 = bound(miny, thread->maxy1, maxy);
4702         miny2 = bound(miny, thread->miny2, maxy);
4703         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's row range overlaps neither band: nothing to draw on this thread,
             // but the thread's reference still has to be released
4704         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4705         {
4706                 if (!ATOMIC_DECREMENT(command->refcount))
4707                 {
                             // a command no larger than the bare struct has its vertex data in a separate
                             // allocation (see DPSOFTRAST_Draw_AllocateDrawCommand), so the last user frees it
4708                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4709                                 MM_FREE(command->arrays);
4710                 }
4711                 return;
4712         }
4713         minx = thread->fb_scissor[0];
4714         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // bounding limits as repeated 16-bit (x, y) pairs; fbmax is inclusive (hence the -1).
             // the y limits run from the start of band 1 to the end of band 2
             // NOTE(review): this presumably relies on band 1 lying above band 2 - confirm
4715         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4716         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4717         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4718         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4719         screen[3] = _mm_setzero_ps();
4720         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4721         for (i = 0;i < numtriangles;i++)
4722         {
                     // screencoord4f is the first array of the command; arrays starts at the second one
                     // (the transformed positions) and is advanced once per shader input further down
4723                 const float *screencoord4f = command->arrays;
4724                 const float *arrays = screencoord4f + numvertices*4;
4725
4726                 // generate the 3 edges of this triangle
4727                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // fetch the three vertex indices, relative to firstvertex when an index list is present
4728                 if (element3s)
4729                 {
4730                         e[0] = element3s[i*3+0] - firstvertex;
4731                         e[1] = element3s[i*3+1] - firstvertex;
4732                         e[2] = element3s[i*3+2] - firstvertex;
4733                 }
4734                 else if (element3i)
4735                 {
4736                         e[0] = element3i[i*3+0] - firstvertex;
4737                         e[1] = element3i[i*3+1] - firstvertex;
4738                         e[2] = element3i[i*3+2] - firstvertex;
4739                 }
4740                 else
4741                 {
4742                         e[0] = i*3+0;
4743                         e[1] = i*3+1;
4744                         e[2] = i*3+2;
4745                 }
4746
                     // SKIPBACKFACE: computes the two edges leaving screen[1] and, in the scalar lane of
                     // trianglenormal, edge1.x*edge2.y - edge1.y*edge2.x (only that lane is meaningful).
                     // depending on cullface it then executes "continue", which skips to the next triangle
                     // of the enclosing for loop. the edges and the normal are reused by the setup code below.
4747 #define SKIPBACKFACE \
4748                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4749                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4750                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4751                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4752                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4753                 switch(cullface) \
4754                 { \
4755                 case GL_BACK: \
4756                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4757                                 continue; \
4758                         break; \
4759                 case GL_FRONT: \
4760                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4761                                 continue; \
4762                         break; \
4763                 }
4764
                     // CLIPPEDVERTEXLERP: screen[k] becomes the projection of the point where edge p1->p2
                     // crosses the near plane. the fraction is remembered in clipfrac[p1] so that
                     // GENATTRIBLERP can later interpolate the other arrays at the same point.
                     // CLIPPEDVERTEXCOPY: screen[k] is taken unchanged from the precomputed screen coordinates.
4765 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4766                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4767                         { \
4768                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4769                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4770                         }
4771 #define CLIPPEDVERTEXCOPY(k,p1) \
4772                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4773
                     // GENATTRIBS: loads the values of the current array for the first three points of
                     // the (possibly clipped) polygon, using for each clipcase the same copy/lerp pattern
                     // that produced screen[0..2] in the clipping code below
4774 #define GENATTRIBCOPY(attrib, p1) \
4775                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4776 #define GENATTRIBLERP(attrib, p1, p2) \
4777                 { \
4778                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4779                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4780                 }
4781 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4782                 switch(clipcase) \
4783                 { \
4784                 default: \
4785                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4786                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4787                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4788                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4789                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4790                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4791                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4792                 }
4793
                     // draws flagged as unclipped skip the distance tests and jump straight into the
                     // "all three vertices in front" branch
4794                 if (! clipped)
4795                         goto notclipped;
4796
4797                 // calculate distance from nearplane
                     // (z + w of the transformed position; a negative value means behind the plane)
4798                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4799                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4800                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // one branch per combination of vertices in front of the plane; each branch builds a
                     // 3 or 4 point polygon in screen[] and records its clipcase for GENATTRIBS
4801                 if (clipdist[0] >= 0.0f)
4802                 {
4803                         if (clipdist[1] >= 0.0f)
4804                         {
4805                                 if (clipdist[2] >= 0.0f)
4806                                 {
4807                                 notclipped:
4808                                         // triangle is entirely in front of nearplane
4809                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4810                                         SKIPBACKFACE;
4811                                         numpoints = 3;
4812                                         clipcase = 0;
4813                                 }
4814                                 else
4815                                 {
                                             // only vertex 2 is behind the plane
4816                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4817                                         SKIPBACKFACE;
4818                                         numpoints = 4;
4819                                         clipcase = 1;
4820                                 }
4821                         }
4822                         else
4823                         {
4824                                 if (clipdist[2] >= 0.0f)
4825                                 {
                                             // only vertex 1 is behind the plane
4826                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4827                                         SKIPBACKFACE;
4828                                         numpoints = 4;
4829                                         clipcase = 2;
4830                                 }
4831                                 else
4832                                 {
                                             // only vertex 0 is in front of the plane
4833                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4834                                         SKIPBACKFACE;
4835                                         numpoints = 3;
4836                                         clipcase = 3;
4837                                 }
4838                         }
4839                 }
4840                 else if (clipdist[1] >= 0.0f)
4841                 {
4842                         if (clipdist[2] >= 0.0f)
4843                         {
                                     // only vertex 0 is behind the plane
4844                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4845                                 SKIPBACKFACE;
4846                                 numpoints = 4;
4847                                 clipcase = 4;
4848                         }
4849                         else
4850                         {
                                     // only vertex 1 is in front of the plane
4851                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4852                                 SKIPBACKFACE;
4853                                 numpoints = 3;
4854                                 clipcase = 5;
4855                         }
4856                 }
4857                 else if (clipdist[2] >= 0.0f)
4858                 {
                             // only vertex 2 is in front of the plane
4859                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4860                         SKIPBACKFACE;
4861                         numpoints = 3;
4862                         clipcase = 6;
4863                 }
4864                 else continue; // triangle is entirely behind nearplane
4865
4866                 {
4867                         // calculate integer y coords for triangle points
                             // screeni holds the truncated (x, y) of all points as eight 16-bit values
                             // (x0,y0,x1,y1,x2,y2,x3,y3); a 3 point polygon repeats point 2 in the last slot.
                             // the min/max reductions leave the bounding box in 16-bit lanes 0 (x) and 1 (y)
4868                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4869                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4870                                         screenmin = _mm_min_epi16(screeni, screenir),
4871                                         screenmax = _mm_max_epi16(screeni, screenir);
4872                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4873                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4874                         screenmin = _mm_max_epi16(screenmin, fbmin);
4875                         screenmax = _mm_min_epi16(screenmax, fbmax);
4876                         // skip offscreen triangles
                             // (the low 32 bits of the comparison cover exactly the x and y lanes)
4877                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4878                                 continue;
4879                         starty = _mm_extract_epi16(screenmin, 1);
4880                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // rows lie entirely in the gap between the two bands
4881                         if (starty >= maxy1 && endy <= miny2)
4882                                 continue;
                             // keep the integer y of every point as one 32-bit lane per point
4883                         screeny = _mm_srai_epi32(screeni, 16);
4884                 }
4885
                     // setup data goes into the next free triangle slot; the slot is only committed
                     // (numtriangles incremented) after the spans have been generated
4886                 triangle = &thread->triangles[thread->numtriangles];
4887
4888                 // calculate attribute plans for triangle data...
4889                 // okay, this triangle is going to produce spans, we'd better project
4890                 // the interpolants now (this is what gives perspective texturing),
4891                 // this consists of simply multiplying all arrays by the W coord
4892                 // (which is basically 1/Z), which will be undone per-pixel
4893                 // (multiplying by Z again) to get the perspective-correct array
4894                 // values
4895                 {
4896                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4897                         __m128 mipedgescale, mipdensity;
                             // the four edge components divided by the scalar normal, then each one
                             // broadcast to all lanes for use as plane coefficients
4898                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4899                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4900                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4901                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4902                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                             // plane equation for w itself: triangle->w = { d/dx, d/dy, value at x=0,y=0 }
4903                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4904                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4905                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4906                         attribedge1 = _mm_sub_ss(w0, w1);
4907                         attribedge2 = _mm_sub_ss(w2, w1);
4908                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4909                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4910                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4911                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4912                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4913                         _mm_store_ss(&triangle->w[0], attribxslope);
4914                         _mm_store_ss(&triangle->w[1], attribyslope);
4915                         _mm_store_ss(&triangle->w[2], attriborigin);
4916                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every array the shader mode lists, with the values
                             // multiplied by w first; the list ends at the first out-of-range id
4917                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4918                         {
4919                                 __m128 attrib0, attrib1, attrib2;
4920                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4921                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4922                                         break;
                                     // step to the next array of the command; the arrays are stored in the
                                     // order of this list (see DPSOFTRAST_Draw_AllocateDrawCommand)
4923                                 arrays += numvertices*4;
4924                                 GENATTRIBS(attrib0, attrib1, attrib2);
4925                                 attriborigin = _mm_mul_ps(attrib1, w1);
4926                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4927                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4928                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4929                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4930                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4931                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4932                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4933                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4934                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4935                                 {
                                             // for the array that drives mip selection: change of its first two
                                             // components along each edge, divided by the edge's screen length
                                             // (approximate, _mm_rsqrt_ps is an estimate)
4936                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4937                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4938                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4939                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4940                                 }
4941                         }
4942
                             // choose one mip level per texture unit used by the shader mode (default 0)
4943                         memset(triangle->mip, 0, sizeof(triangle->mip));
4944                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4945                         {
4946                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4947                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4948                                         break;
4949                                 texture = thread->texbound[texunit];
4950                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4951                                 {
                                             // scale by the two ints at mipmap[0][2] and mipmap[0][3]
                                             // (presumably width and height of mip level 0 - TODO confirm),
                                             // square and sum per edge, and keep the smaller of the two edges
4952                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4953                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4954                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4955                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4956                                         // this will be multiplied in the texturing routine by the texture resolution
4957                                         y = _mm_cvtss_si32(mipdensity);
4958                                         if (y > 0)
4959                                         {
                                                     // y is a squared density, so the level is half of log2(y),
                                                     // limited to the last mip level of the texture
4960                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4961                                                 if (y > texture->mipmaps - 1)
4962                                                         y = texture->mipmaps - 1;
4963                                                 triangle->mip[texunit] = y;
4964                                         }
4965                                 }
4966                         }
4967                 }
4968         
                     // scan conversion, in up to two passes: first the rows up to the end of band 1,
                     // then y is moved to the start of band 2 and the rows up to the end of band 2 follow
4969                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4970                 for (; y < bandy;)
4971                 {
4972                         __m128 xcoords, xslope;
                             // one nibble of yccmask per point, set when y is greater than that point's
                             // integer y (the point has been passed). the bit strings in the case comments
                             // list, in point order, which points are not yet passed. for a 3 point polygon
                             // the fourth nibble mirrors point 2.
4973                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4974                         int yccmask = _mm_movemask_epi8(ycc);
                             // the two edges crossing this row, each running from point p to point n
4975                         int edge0p, edge0n, edge1p, edge1n;
4976                         int nexty;
4977                         if (numpoints == 4)
4978                         {
4979                                 switch(yccmask)
4980                                 {
4981                                 default:
                                     // all points passed: no rows left; no point passed: advance to the next row
4982                                 case 0xFFFF: /*0000*/ y = endy; continue;
4983                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4984                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4985                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4986                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4987                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4988                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4989                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4990                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4991                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4992                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4993                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4994                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4995                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4996                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4997                                 case 0x0000: /*1111*/ y++; continue;
4998                                 }
4999                         }
5000                         else
5001                         {
5002                                 switch(yccmask)
5003                                 {
5004                                 default:
5005                                 case 0xFFFF: /*000*/ y = endy; continue;
5006                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5007                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5008                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5009                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5010                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5011                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5012                                 case 0x0000: /*111*/ y++; continue;
5013                                 }
5014                         }
                             // nexty = smallest integer y among the points not yet passed: passed points
                             // turn into 0x7FFF, the others keep their y, then a min across all lanes.
                             // this is the last row for which the chosen edge pair is valid (limited to the band)
5015                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5016                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5017                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5018                         nexty = _mm_extract_epi16(ycc, 0);
5019                         if (nexty >= bandy) nexty = bandy-1;
                             // per edge: dx/dy in lanes 0 and 2, and the x position on row y (plus 0.5 so
                             // the conversion to int below lands on the intended pixel)
5020                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5021                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5022                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5023                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5024                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // make edge 0 the left one on this row by swapping the two halves if needed
5025                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5026                         {
5027                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5028                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5029                         }
                             // emit the rows y..nexty, stepping both edge positions by their slope
5030                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5031                         {
5032                                 int startx, endx, offset;
5033                                 startx = _mm_cvtss_si32(xcoords);
5034                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5035                                 if (startx < minx) 
5036                                 {
                                             // move the start towards minx in whole span lengths only; what is
                                             // left of minx is trimmed by span->startx below
                                             // NOTE(review): the mask assumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a power of two - confirm
5037                                         if (startx < 0) startx = 0;
5038                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5039                                 }
5040                                 if (endx > maxx) endx = maxx;
5041                                 if (startx >= endx) continue;
                                     // cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
                                     // span->x is the span origin, span->startx/endx are relative to it
5042                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5043                                 {
5044                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5045                                         span->triangle = thread->numtriangles;
5046                                         span->x = offset;
5047                                         span->y = y;
5048                                         span->startx = max(minx - offset, 0);
5049                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                             // an empty span is not committed; its slot is reused
5050                                         if (span->startx >= span->endx)
5051                                                 continue; 
                                             // span buffer full: process what has been queued so far
5052                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5053                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5054                                 }
5055                         }
5056                 }
5057
                     // commit the triangle slot; when the triangle buffer is full the queued spans
                     // are processed before the slots are reused
5058                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5059                 {
5060                         DPSOFTRAST_Draw_ProcessSpans(thread);
5061                         thread->numtriangles = 0;
5062                 }
5063         }
5064
             // this thread no longer reads the command's arrays; release its reference and let the
             // last thread free a separately allocated vertex buffer (same rule as in the early-out above)
5065         if (!ATOMIC_DECREMENT(command->refcount))
5066         {
5067                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5068                         MM_FREE(command->arrays);
5069         }
5070
             // process whatever is still queued so nothing carries over to the next command
5071         if (thread->numspans > 0 || thread->numtriangles > 0)
5072         {
5073                 DPSOFTRAST_Draw_ProcessSpans(thread);
5074                 thread->numtriangles = 0;
5075         }
5076 #endif
5077 }
5078
5079 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5080 {
5081         int i;
5082         int j;
5083         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5084         int datasize = 2*numvertices*sizeof(float[4]);
5085         DPSOFTRAST_Command_Draw *command;
5086         unsigned char *data;
5087         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5088         {
5089                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5090                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5091                         break;
5092                 datasize += numvertices*sizeof(float[4]);
5093         }
5094         if (element3s)
5095                 datasize += numtriangles*sizeof(unsigned short[3]);
5096         else if (element3i)
5097                 datasize += numtriangles*sizeof(int[3]);
5098         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5099         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5100         {
5101                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5102                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5103         }
5104         else
5105         {
5106                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5107                 data = (unsigned char *)command + commandsize;
5108         }
5109         command->firstvertex = firstvertex;
5110         command->numvertices = numvertices;
5111         command->numtriangles = numtriangles;
5112         command->arrays = (float *)data;
5113         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5114         dpsoftrast.firstvertex = firstvertex;
5115         dpsoftrast.numvertices = numvertices;
5116         dpsoftrast.screencoord4f = (float *)data;
5117         data += numvertices*sizeof(float[4]);
5118         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5119         data += numvertices*sizeof(float[4]);
5120         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5121         {
5122                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5123                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5124                         break;
5125                 dpsoftrast.post_array4f[j] = (float *)data;
5126                 data += numvertices*sizeof(float[4]);
5127         }
5128         command->element3i = NULL;
5129         command->element3s = NULL;
5130         if (element3s)
5131         {
5132                 command->element3s = (unsigned short *)data;
5133                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5134         }
5135         else if (element3i)
5136         {
5137                 command->element3i = (int *)data;
5138                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5139         }
5140         return command;
5141 }
5142
5143 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5144 {
5145         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5146         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5147         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5148         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5149         if (command->starty >= command->endy)
5150         {
5151                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5152                         MM_FREE(command->arrays);
5153                 DPSOFTRAST_UndoCommand(command->commandsize);
5154                 return;
5155         }
5156         command->clipped = dpsoftrast.drawclipped;
5157         command->refcount = dpsoftrast.numthreads;
5158
5159         if (dpsoftrast.usethreads)
5160         {
5161                 int i;
5162                 DPSOFTRAST_Draw_SyncCommands();
5163                 for (i = 0; i < dpsoftrast.numthreads; i++)
5164                 {
5165                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5166                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5167                                 Thread_CondSignal(thread->drawcond);
5168                 }
5169         }
5170         else
5171         {
5172                 DPSOFTRAST_Draw_FlushThreads();
5173         }
5174 }
5175
5176 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5177 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5178 {
5179         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5180 }
5181 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5182 {
5183         DPSOFTRAST_Command_SetRenderTargets *command;
5184         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5185                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5186                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5187                 DPSOFTRAST_Flush();
5188         dpsoftrast.fb_width = width;
5189         dpsoftrast.fb_height = height;
5190         dpsoftrast.fb_depthpixels = depthpixels;
5191         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5192         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5193         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5194         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5195         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5196         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5197         command->width = width;
5198         command->height = height;
5199 }
5200  
5201 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5202 {
5203         int commandoffset = thread->commandoffset;
5204         while (commandoffset != endoffset)
5205         {
5206                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5207                 switch (command->opcode)
5208                 {
5209 #define INTERPCOMMAND(name) \
5210                 case DPSOFTRAST_OPCODE_##name : \
5211                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5212                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5213                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5214                                 commandoffset = 0; \
5215                         break;
5216                 INTERPCOMMAND(Viewport)
5217                 INTERPCOMMAND(ClearColor)
5218                 INTERPCOMMAND(ClearDepth)
5219                 INTERPCOMMAND(ColorMask)
5220                 INTERPCOMMAND(DepthTest)
5221                 INTERPCOMMAND(ScissorTest)
5222                 INTERPCOMMAND(Scissor)
5223                 INTERPCOMMAND(BlendFunc)
5224                 INTERPCOMMAND(BlendSubtract)
5225                 INTERPCOMMAND(DepthMask)
5226                 INTERPCOMMAND(DepthFunc)
5227                 INTERPCOMMAND(DepthRange)
5228                 INTERPCOMMAND(PolygonOffset)
5229                 INTERPCOMMAND(CullFace)
5230                 INTERPCOMMAND(AlphaTest)
5231                 INTERPCOMMAND(AlphaFunc)
5232                 INTERPCOMMAND(SetTexture)
5233                 INTERPCOMMAND(SetShader)
5234                 INTERPCOMMAND(Uniform4f)
5235                 INTERPCOMMAND(UniformMatrix4f)
5236                 INTERPCOMMAND(Uniform1i)
5237                 INTERPCOMMAND(SetRenderTargets)
5238
5239                 case DPSOFTRAST_OPCODE_Draw:
5240                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5241                         commandoffset += command->commandsize;
5242                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5243                                 commandoffset = 0;
5244                         thread->commandoffset = commandoffset;
5245                         break;
5246
5247                 case DPSOFTRAST_OPCODE_Reset:
5248                         commandoffset = 0;
5249                         break;
5250                 }
5251         }
5252         thread->commandoffset = commandoffset;
5253 }
5254
5255 static int DPSOFTRAST_Draw_Thread(void *data)
5256 {
5257         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5258         while(thread->index >= 0)
5259         {
5260                 if (thread->commandoffset != dpsoftrast.drawcommand)
5261                 {
5262                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5263                 }
5264                 else 
5265                 {
5266                         Thread_LockMutex(thread->drawmutex);
5267                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5268                         {
5269                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5270                                 thread->starving = true;
5271                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5272                                 thread->starving = false;
5273                         }
5274                         Thread_UnlockMutex(thread->drawmutex);
5275                 }
5276         }   
5277         return 0;
5278 }
5279
5280 static void DPSOFTRAST_Draw_FlushThreads(void)
5281 {
5282         DPSOFTRAST_State_Thread *thread;
5283         int i;
5284         DPSOFTRAST_Draw_SyncCommands();
5285         if (dpsoftrast.usethreads) 
5286         {
5287                 for (i = 0; i < dpsoftrast.numthreads; i++)
5288                 {
5289                         thread = &dpsoftrast.threads[i];
5290                         if (thread->commandoffset != dpsoftrast.drawcommand)
5291                         {
5292                                 Thread_LockMutex(thread->drawmutex);
5293                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5294                                         Thread_CondSignal(thread->drawcond);
5295                                 Thread_UnlockMutex(thread->drawmutex);
5296                         }
5297                 }
5298                 for (i = 0; i < dpsoftrast.numthreads; i++)
5299                 {
5300                         thread = &dpsoftrast.threads[i];
5301                         if (thread->commandoffset != dpsoftrast.drawcommand)
5302                         {
5303                                 Thread_LockMutex(thread->drawmutex);
5304                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5305                                 {
5306                                         thread->waiting = true;
5307                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5308                                         thread->waiting = false;
5309                                 }
5310                                 Thread_UnlockMutex(thread->drawmutex);
5311                         }
5312                 }
5313         }
5314         else
5315         {
5316                 for (i = 0; i < dpsoftrast.numthreads; i++)
5317                 {
5318                         thread = &dpsoftrast.threads[i];
5319                         if (thread->commandoffset != dpsoftrast.drawcommand)
5320                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5321                 }
5322         }
5323         dpsoftrast.commandpool.usedcommands = 0;
5324 }
5325
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5330
// Public finish entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5335
5336 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5337 {
5338         int i;
5339         union
5340         {
5341                 int i;
5342                 unsigned char b[4];
5343         }
5344         u;
5345         u.i = 1;
5346         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5347         dpsoftrast.bigendian = u.b[3];
5348         dpsoftrast.fb_width = width;
5349         dpsoftrast.fb_height = height;
5350         dpsoftrast.fb_depthpixels = depthpixels;
5351         dpsoftrast.fb_colorpixels[0] = colorpixels;
5352         dpsoftrast.fb_colorpixels[1] = NULL;
5353         dpsoftrast.fb_colorpixels[1] = NULL;
5354         dpsoftrast.fb_colorpixels[1] = NULL;
5355         dpsoftrast.viewport[0] = 0;
5356         dpsoftrast.viewport[1] = 0;
5357         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5358         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5359         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5360         dpsoftrast.texture_firstfree = 1;
5361         dpsoftrast.texture_end = 1;
5362         dpsoftrast.texture_max = 0;
5363         dpsoftrast.color[0] = 1;
5364         dpsoftrast.color[1] = 1;
5365         dpsoftrast.color[2] = 1;
5366         dpsoftrast.color[3] = 1;
5367         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5368         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5369         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5370         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5371         for (i = 0; i < dpsoftrast.numthreads; i++)
5372         {
5373                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5374                 thread->index = i;
5375                 thread->cullface = GL_BACK;
5376                 thread->colormask[1] = 1;
5377                 thread->colormask[2] = 1;
5378                 thread->colormask[3] = 1;
5379                 thread->blendfunc[0] = GL_ONE;
5380                 thread->blendfunc[1] = GL_ZERO;
5381                 thread->depthmask = true;
5382                 thread->depthtest = true;
5383                 thread->depthfunc = GL_LEQUAL;
5384                 thread->scissortest = false;
5385                 thread->alphatest = false;
5386                 thread->alphafunc = GL_GREATER;
5387                 thread->alphavalue = 0.5f;
5388                 thread->viewport[0] = 0;
5389                 thread->viewport[1] = 0;
5390                 thread->viewport[2] = dpsoftrast.fb_width;
5391                 thread->viewport[3] = dpsoftrast.fb_height;
5392                 thread->scissor[0] = 0;
5393                 thread->scissor[1] = 0;
5394                 thread->scissor[2] = dpsoftrast.fb_width;
5395                 thread->scissor[3] = dpsoftrast.fb_height;
5396                 thread->depthrange[0] = 0;
5397                 thread->depthrange[1] = 1;
5398                 thread->polygonoffset[0] = 0;
5399                 thread->polygonoffset[1] = 0;
5400         
5401                 DPSOFTRAST_RecalcThread(thread);
5402         
5403                 thread->numspans = 0;
5404                 thread->numtriangles = 0;
5405                 thread->commandoffset = 0;
5406                 thread->waiting = false;
5407                 thread->starving = false;
5408            
5409                 thread->validate = -1;
5410                 DPSOFTRAST_Validate(thread, -1);
5411  
5412                 if (dpsoftrast.usethreads)
5413                 {
5414                         thread->waitcond = Thread_CreateCond();
5415                         thread->drawcond = Thread_CreateCond();
5416                         thread->drawmutex = Thread_CreateMutex();
5417                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5418                 }
5419         }
5420         return 0;
5421 }
5422
5423 void DPSOFTRAST_Shutdown(void)
5424 {
5425         int i;
5426         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5427         {
5428                 DPSOFTRAST_State_Thread *thread;
5429                 for (i = 0; i < dpsoftrast.numthreads; i++)
5430                 {
5431                         thread = &dpsoftrast.threads[i];
5432                         Thread_LockMutex(thread->drawmutex);
5433                         thread->index = -1;
5434                         Thread_CondSignal(thread->drawcond);
5435                         Thread_UnlockMutex(thread->drawmutex);
5436                         Thread_WaitThread(thread->thread, 0);
5437                         Thread_DestroyCond(thread->waitcond);
5438                         Thread_DestroyCond(thread->drawcond);
5439                         Thread_DestroyMutex(thread->drawmutex);
5440                 }
5441         }
5442         for (i = 0;i < dpsoftrast.texture_end;i++)
5443                 if (dpsoftrast.texture[i].bytes)
5444                         MM_FREE(dpsoftrast.texture[i].bytes);
5445         if (dpsoftrast.texture)
5446                 free(dpsoftrast.texture);
5447         if (dpsoftrast.threads)
5448                 MM_FREE(dpsoftrast.threads);
5449         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5450 }
5451