/*
 * git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
 * WGL client can now use DPSOFTRAST, added thread_win.c to avoid SDL dependency for...
 * [xonotic/darkplaces.git] / dpsoftrast.c
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// ALIGN_SIZE / ATOMIC_SIZE: byte alignments matching the ALIGN() and ATOMIC()
// wrappers below (ATOMIC_SIZE is also the alignment handed to _mm_malloc by
// the MM_MALLOC/MM_CALLOC helpers further down).
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Platform-specific alignment and atomic primitives. They are only selected
// when SSE_POSSIBLE is defined; one branch per toolchain (Apple, GCC, MSVC):
//   ALIGN(var)               declares var with 16-byte alignment
//   ATOMIC(var)              declares var with 32-byte alignment
//   MEMORY_BARRIER           expands to _mm_sfence()
//   ATOMIC_COUNTER           integer type used for shared counters
//   ATOMIC_INCREMENT/DECREMENT/ADD  atomic read-modify-write on a counter
20 #ifdef SSE_POSSIBLE
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile int
36                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
39         #elif defined(_MSC_VER)
40                 #define ALIGN(var) __declspec(align(16)) var
41                 #define ATOMIC(var) __declspec(align(32)) var
42                 #define MEMORY_BARRIER (_mm_sfence())
43                 //(MemoryBarrier())
44                 #define ATOMIC_COUNTER volatile LONG
45                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
48         #endif
49 #endif
50
// Fallbacks for every macro a branch above did not define (SSE_POSSIBLE
// unset, or none of the three toolchains matched). These are plain,
// NON-atomic operations with no alignment request and no memory barrier.
51 #ifndef ALIGN
52 #define ALIGN(var) var
53 #endif
54 #ifndef ATOMIC
55 #define ATOMIC(var) var
56 #endif
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
59 #endif
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
62 #endif
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
65 #endif
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
68 #endif
69 #ifndef ATOMIC_ADD
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
71 #endif
72
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Aligned allocation wrappers: every block is aligned to ATOMIC_SIZE bytes
// and must be released with MM_FREE (not plain free).
#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// calloc-style helper on top of _mm_malloc.
// Returns nmemb*size zero-filled bytes aligned to ATOMIC_SIZE, or NULL when
// the allocation fails or nmemb*size does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// unlike calloc, _mm_malloc takes a single byte count, so the
	// multiplication has to be checked here to avoid a short allocation
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// no SSE: fall back to the plain C allocator (no extra alignment)
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Vertex attribute array slots. DPSOFTRAST_ARRAY_TOTAL is the slot count and
// sizes the per-attribute arrays declared later in this file
// (DPSOFTRAST_State_Triangle.attribs and DPSOFTRAST_State.post_array4f).
92 typedef enum DPSOFTRAST_ARRAY_e
93 {
94         DPSOFTRAST_ARRAY_POSITION,
95         DPSOFTRAST_ARRAY_COLOR,
96         DPSOFTRAST_ARRAY_TEXCOORD0,
97         DPSOFTRAST_ARRAY_TEXCOORD1,
98         DPSOFTRAST_ARRAY_TEXCOORD2,
99         DPSOFTRAST_ARRAY_TEXCOORD3,
100         DPSOFTRAST_ARRAY_TEXCOORD4,
101         DPSOFTRAST_ARRAY_TEXCOORD5,
102         DPSOFTRAST_ARRAY_TEXCOORD6,
103         DPSOFTRAST_ARRAY_TEXCOORD7,
104         DPSOFTRAST_ARRAY_TOTAL
105 }
106 DPSOFTRAST_ARRAY;
107
// One slot of the dpsoftrast.texture array. A slot whose bytes pointer is
// NULL is treated as free by DPSOFTRAST_Texture_GetByIndex and
// DPSOFTRAST_Texture_New.
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags; // flags value passed to DPSOFTRAST_Texture_New
111         int width;
112         int height;
113         int depth;
114         int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps; // number of valid entries in mipmap[]
117         int size; // total byte size of all mipmap levels (allocation size of bytes)
118         ATOMIC_COUNTER binds; // when nonzero, the texture functions call DPSOFTRAST_Flush before touching the pixels
119         unsigned char *bytes; // pixel storage allocated with MM_CALLOC, 4 bytes per texel
        // per mipmap level: [0] byte offset into bytes, [1] byte size of the level,
        // [2] width, [3] height, [4] depth
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// Command structs are declared through COMMAND_ALIGN, i.e. with the same
// alignment request as ALIGN() (COMMAND_SIZE mirrors ALIGN_SIZE).
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
126
// Common header shared by every command: DEFCOMMAND below starts each
// generated struct with the same two members.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
128 {
129         unsigned char opcode;
130         unsigned short commandsize;
131 }
132 DPSOFTRAST_Command);
133
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
135
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned struct
// DPSOFTRAST_Command_<name>, made of the opcode/commandsize header followed
// by the caller-supplied member declarations in `fields`.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
139         { \
140                 unsigned char opcode; \
141                 unsigned short commandsize; \
142                 fields \
143         } DPSOFTRAST_Command_##name );
144
// Byte size of the command storage in DPSOFTRAST_State_Command_Pool.
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size allowed for one command; its use
// is not visible in this part of the file - confirm.
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
// Fixed-size byte buffer that holds queued commands (embedded in
// DPSOFTRAST_State as commandpool).
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand; // NOTE(review): looks like the offset of the next free byte in commands[] - confirm
151         int usedcommands; // NOTE(review): looks like the number of bytes currently in use - confirm
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB
// macros below.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
        // attribs[array][0] = change per unit of span x, attribs[array][1] = change
        // per unit of span y, attribs[array][2] = base value; DPSOFTRAST_CALCATTRIB
        // evaluates base + x*[0] + y*[1] for the 4 components
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluate one 4-component vertex attribute at the start of a span.
//   triangle   - pointer to a DPSOFTRAST_State_Triangle
//   span       - pointer to a DPSOFTRAST_State_Span (its x and y are used)
//   data       - receives attribs[2] + x*attribs[0] + y*attribs[1]
//   slope      - receives attribs[0], the per-x-step increment
//   arrayindex - DPSOFTRAST_ARRAY_* slot to evaluate
// Both macros expand to a braced statement block. Arguments are evaluated
// more than once, so they must not have side effects.

// SSE variant: data and slope are __m128 lvalues
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// scalar variant: data and slope are float[4]
// (the span argument is parenthesized so that expressions such as &span work)
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
182
// One horizontal run of pixels queued for shading; instances are stored in
// DPSOFTRAST_State_Thread.spans[]. The x and y members are the values the
// DPSOFTRAST_CALCATTRIB macros multiply with the triangle's attribute slopes.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
184 {
185         int triangle; // triangle this span was generated by
186         int x; // framebuffer x coord
187         int y; // framebuffer y coord
188         int startx; // usable range (according to pixelmask)
189         int endx; // usable range (according to pixelmask)
190         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 }
192 DPSOFTRAST_State_Span);
193
// capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
196
// Bit flags stored in DPSOFTRAST_State_Thread.validate. A set bit makes
// DPSOFTRAST_Validate call the matching recalculation
// (FB -> DPSOFTRAST_RecalcFB, DEPTHFUNC -> DPSOFTRAST_RecalcDepthFunc,
// BLENDFUNC -> DPSOFTRAST_RecalcBlendFunc) and then clear the bit.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three flags combined
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the thread's
// (source factor, destination factor) pair; the pairs listed below are the
// ones that function maps to each mode. Any other pair falls back to OPAQUE.
202 typedef enum DPSOFTRAST_BLENDMODE_e
203 {
204         DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO (also the fallback)
205         DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
206         DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
207         DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
208         DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
209         DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
210         DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
211         DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
212         DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
213         DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
214         DPSOFTRAST_BLENDMODE_TOTAL
215 }
216 DPSOFTRAST_BLENDMODE;
217
// State block of one rasterizer thread (element of dpsoftrast.threads[]).
// It holds render state (cull/blend/depth/scissor/viewport, shader selection,
// bound textures, uniforms), the values derived from that state by the
// DPSOFTRAST_Recalc* functions, and the thread's span/triangle work buffers.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
221         int index; // position of this thread, used by DPSOFTRAST_RecalcThread to pick its rows
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2]; // [0] source factor, [1] destination factor (see DPSOFTRAST_RecalcBlendFunc)
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcViewport
235         int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
236         float depthrange[2];
237         float polygonoffset[2];
238
239         int shader_mode;
240         int shader_permutation;
241         int shader_exactspecularmath;
242
243         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture, rebased by DPSOFTRAST_Texture_Grow
244         
245         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
247
248         // DPSOFTRAST_VALIDATE_ flags
249         int validate;
250
251         // derived values (DPSOFTRAST_VALIDATE_FB)
252         int fb_colormask;
253         int fb_scissor[4]; // x, y, width, height in framebuffer rows (y flipped against scissor[]), clamped to the framebuffer
254         ALIGN(float fb_viewportcenter[4]);
255         ALIGN(float fb_viewportscale[4]);
256
257         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
258         int fb_depthfunc; // depthfunc, or GL_ALWAYS when depthtest is off
259
260         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
261         int fb_blendmode; // one of DPSOFTRAST_BLENDMODE_*
262
263         // band boundaries
        // set by DPSOFTRAST_RecalcThread from index, dpsoftrast.fb_height and
        // dpsoftrast.numthreads; both bands are identical unless dpsoftrast.interlace is set
264         int miny1;
265         int maxy1;
266         int miny2;
267         int maxy2;
268
269         ATOMIC(volatile int commandoffset);
270
271         volatile bool waiting;
272         volatile bool starving;
273         void *waitcond;
274         void *drawcond;
275         void *drawmutex;
276
277         int numspans;
278         int numtriangles;
279         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
281 }
282 DPSOFTRAST_State_Thread);
283
// Global rasterizer state: framebuffer description, vertex array pointers,
// the texture table, the thread list and the command pool. A single instance
// named dpsoftrast is defined right after the struct.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
285 {
286         int fb_width;
287         int fb_height;
288         unsigned int *fb_depthpixels;
289         unsigned int *fb_colorpixels[4];
290
291         int viewport[4];
292         ALIGN(float fb_viewportcenter[4]);
293         ALIGN(float fb_viewportscale[4]);
294
295         float color[4];
296         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
298
299         const float *pointer_vertex3f;
300         const float *pointer_color4f;
301         const unsigned char *pointer_color4ub;
302         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         int stride_vertex;
304         int stride_color;
305         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into texture[], rebased by DPSOFTRAST_Texture_Grow
308
309         int firstvertex;
310         int numvertices;
311         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312         float *screencoord4f;
313         int drawstarty;
314         int drawendy;
315         int drawclipped;
316         
317         int shader_mode;
318         int shader_permutation;
319         int shader_exactspecularmath;
320
321         int texture_max; // allocated capacity of texture[] (raised by DPSOFTRAST_Texture_Grow)
322         int texture_end; // one past the highest slot index that may be in use
323         int texture_firstfree; // index where DPSOFTRAST_Texture_New starts looking for a free slot
324         DPSOFTRAST_Texture *texture; // texture table; indices below 1 are rejected by DPSOFTRAST_Texture_GetByIndex
325
326         int bigendian;
327
328         // error reporting
        // set to a static message by functions in this file when a call is rejected
329         const char *errorstring;
330
331         bool usethreads;
332         int interlace; // nonzero: DPSOFTRAST_RecalcThread gives every thread two separate row bands
333         int numthreads;
334         DPSOFTRAST_State_Thread *threads;
335
336         ATOMIC(volatile int drawcommand);
337
338         DPSOFTRAST_State_Command_Pool commandpool;
339 }
340 DPSOFTRAST_State);
341
// the one global rasterizer state instance used throughout this file
342 DPSOFTRAST_State dpsoftrast;
343
// depth buffer scale: 1024*1048576 = 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float colour components (1.0 maps to 255) into one int laid out
// as A<<24 | R<<16 | G<<8 | B. Every argument is parenthesized so that
// expressions can be passed, and the shifts are done on unsigned values
// because shifting 255 into the sign bit of an int is undefined behaviour;
// the result is converted back to int so the macro keeps its original type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer format:
// d = 1 gives 0, d = 0 gives DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
349
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
351 {
352         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354         fb_viewportcenter[3] = 0.5f;
355         fb_viewportcenter[0] = 0.0f;
356         fb_viewportscale[1] = 0.5f * viewport[2];
357         fb_viewportscale[2] = -0.5f * viewport[3];
358         fb_viewportscale[3] = 0.5f;
359         fb_viewportscale[0] = 1.0f;
360 }
361
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
363 {
364         if (dpsoftrast.interlace)
365         {
366                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370         }
371         else
372         {
373                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375         }
376 }
377
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
379 {
380         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381         // and viewport projection values
382         int x1, x2;
383         int y1, y2;
384         x1 = thread->scissor[0];
385         x2 = thread->scissor[0] + thread->scissor[2];
386         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387         y2 = dpsoftrast.fb_height - thread->scissor[1];
388         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
389         if (x1 < 0) x1 = 0;
390         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
391         if (y1 < 0) y1 = 0;
392         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393         thread->fb_scissor[0] = x1;
394         thread->fb_scissor[1] = y1;
395         thread->fb_scissor[2] = x2 - x1;
396         thread->fb_scissor[3] = y2 - y1;
397
398         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399         DPSOFTRAST_RecalcThread(thread);
400 }
401
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
403 {
404         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
405 }
406
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
408 {
409         if (thread->blendsubtract)
410         {
411                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
412                 {
413                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
417                 }
418         }
419         else
420         {       
421                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
422                 {
423                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
428                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
434                 }
435         }
436 }
437
438 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
439
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
441 {
442         mask &= thread->validate;
443         if (!mask)
444                 return;
445         if (mask & DPSOFTRAST_VALIDATE_FB)
446         {
447                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448                 DPSOFTRAST_RecalcFB(thread);
449         }
450         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
451         {
452                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453                 DPSOFTRAST_RecalcDepthFunc(thread);
454         }
455         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
456         {
457                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458                 DPSOFTRAST_RecalcBlendFunc(thread);
459         }
460 }
461
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
463 {
464         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465                 return &dpsoftrast.texture[index];
466         return NULL;
467 }
468
469 static void DPSOFTRAST_Texture_Grow(void)
470 {
471         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472         DPSOFTRAST_State_Thread *thread;
473         int i;
474         int j;
475         DPSOFTRAST_Flush();
476         // expand texture array as needed
477         if (dpsoftrast.texture_max < 1024)
478                 dpsoftrast.texture_max = 1024;
479         else
480                 dpsoftrast.texture_max *= 2;
481         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
482         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483                 if (dpsoftrast.texbound[i])
484                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
485         for (j = 0; j < dpsoftrast.numthreads; j++)
486         {
487                 thread = &dpsoftrast.threads[j];
488                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489                         if (thread->texbound[i])
490                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
491         }
492 }
493
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
495 {
496         int w;
497         int h;
498         int d;
499         int size;
500         int s;
501         int texnum;
502         int mipmaps;
503         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505         DPSOFTRAST_Texture *texture;
506         if (width*height*depth < 1)
507         {
508                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
509                 return 0;
510         }
511         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
512         {
513                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
514                 return 0;
515         }
516         switch(texformat)
517         {
518         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
521                 break;
522         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
524                 {
525                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526                         return 0;
527                 }
528                 if (depth != 1)
529                 {
530                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
531                         return 0;
532                 }
533                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
534                 {
535                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
536                         return 0;
537                 }
538                 break;
539         }
540         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
541         {
542                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
543                 return 0;
544         }
545         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
546         {
547                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
548                 return 0;
549         }
550         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
551         {
552                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
553                 return 0;
554         }
555         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
556         {
557                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
558                 return 0;
559         }
560         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
561         {
562                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
563                 return 0;
564         }
565         // find first empty slot in texture array
566         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567                 if (!dpsoftrast.texture[texnum].bytes)
568                         break;
569         dpsoftrast.texture_firstfree = texnum + 1;
570         if (dpsoftrast.texture_max <= texnum)
571                 DPSOFTRAST_Texture_Grow();
572         if (dpsoftrast.texture_end <= texnum)
573                 dpsoftrast.texture_end = texnum + 1;
574         texture = &dpsoftrast.texture[texnum];
575         memset(texture, 0, sizeof(*texture));
576         texture->flags = flags;
577         texture->width = width;
578         texture->height = height;
579         texture->depth = depth;
580         texture->sides = sides;
581         texture->binds = 0;
582         w = width;
583         h = height;
584         d = depth;
585         size = 0;
586         mipmaps = 0;
587         w = width;
588         h = height;
589         d = depth;
590         for (;;)
591         {
592                 s = w * h * d * sides * 4;
593                 texture->mipmap[mipmaps][0] = size;
594                 texture->mipmap[mipmaps][1] = s;
595                 texture->mipmap[mipmaps][2] = w;
596                 texture->mipmap[mipmaps][3] = h;
597                 texture->mipmap[mipmaps][4] = d;
598                 size += s;
599                 mipmaps++;
600                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
601                         break;
602                 if (w > 1) w >>= 1;
603                 if (h > 1) h >>= 1;
604                 if (d > 1) d >>= 1;
605         }
606         texture->mipmaps = mipmaps;
607         texture->size = size;
608
609         // allocate the pixels now
610         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
611
612         return texnum;
613 }
614 void DPSOFTRAST_Texture_Free(int index)
615 {
616         DPSOFTRAST_Texture *texture;
617         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
618         if (texture->binds)
619                 DPSOFTRAST_Flush();
620         if (texture->bytes)
621                 MM_FREE(texture->bytes);
622         texture->bytes = NULL;
623         memset(texture, 0, sizeof(*texture));
624         // adjust the free range and used range
625         if (dpsoftrast.texture_firstfree > index)
626                 dpsoftrast.texture_firstfree = index;
627         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628                 dpsoftrast.texture_end--;
629 }
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
631 {
632         int i, x, y, z, w, layer0, layer1, row0, row1;
633         unsigned char *o, *i0, *i1, *i2, *i3;
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->mipmaps <= 1)
637                 return;
638         for (i = 1;i < texture->mipmaps;i++)
639         {
640                 for (z = 0;z < texture->mipmap[i][4];z++)
641                 {
642                         layer0 = z*2;
643                         layer1 = z*2+1;
644                         if (layer1 >= texture->mipmap[i-1][4])
645                                 layer1 = texture->mipmap[i-1][4]-1;
646                         for (y = 0;y < texture->mipmap[i][3];y++)
647                         {
648                                 row0 = y*2;
649                                 row1 = y*2+1;
650                                 if (row1 >= texture->mipmap[i-1][3])
651                                         row1 = texture->mipmap[i-1][3]-1;
652                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
653                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657                                 w = texture->mipmap[i][2];
658                                 if (layer1 > layer0)
659                                 {
660                                         if (texture->mipmap[i-1][2] > 1)
661                                         {
662                                                 // average 3D texture
663                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
664                                                 {
665                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669                                                 }
670                                         }
671                                         else
672                                         {
673                                                 // average 3D mipmap with parent width == 1
674                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
675                                                 {
676                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
680                                                 }
681                                         }
682                                 }
683                                 else
684                                 {
685                                         if (texture->mipmap[i-1][2] > 1)
686                                         {
687                                                 // average 2D texture (common case)
688                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
689                                                 {
690                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694                                                 }
695                                         }
696                                         else
697                                         {
698                                                 // 2D texture with parent width == 1
699                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
700                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
701                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
702                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
703                                         }
704                                 }
705                         }
706                 }
707         }
708 }
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
710 {
711         DPSOFTRAST_Texture *texture;
712         unsigned char *dst;
713         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
714         if (texture->binds)
715                 DPSOFTRAST_Flush();
716         if (pixels)
717         {
718                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719                 while (blockheight > 0)
720                 {
721                         memcpy(dst, pixels, blockwidth * 4);
722                         pixels += blockwidth * 4;
723                         dst += texture->mipmap[0][2] * 4;
724                         blockheight--;
725                 }
726         }
727         DPSOFTRAST_Texture_CalculateMipmaps(index);
728 }
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (texture->binds)
734                 DPSOFTRAST_Flush();
735         if (pixels)
736                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737         DPSOFTRAST_Texture_CalculateMipmaps(index);
738 }
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
740 {
741         DPSOFTRAST_Texture *texture;
742         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743         return texture->mipmap[mip][2];
744 }
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
746 {
747         DPSOFTRAST_Texture *texture;
748         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749         return texture->mipmap[mip][3];
750 }
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755         return texture->mipmap[mip][4];
756 }
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         if (texture->binds)
762                 DPSOFTRAST_Flush();
763         return texture->bytes + texture->mipmap[mip][0];
764 }
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
766 {
767         DPSOFTRAST_Texture *texture;
768         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
770         {
771                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772                 return;
773         }
774         if (texture->binds)
775                 DPSOFTRAST_Flush();
776         texture->filter = filter;
777 }
778
779 static void DPSOFTRAST_Draw_FlushThreads(void);
780
781 static void DPSOFTRAST_Draw_SyncCommands(void)
782 {
783         if(dpsoftrast.usethreads) MEMORY_BARRIER;
784         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
785 }
786
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
788 {
789         DPSOFTRAST_State_Thread *thread;
790         int i;
791         int freecommand = dpsoftrast.commandpool.freecommand;
792         int usedcommands = dpsoftrast.commandpool.usedcommands;
793         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
794                 return;
795         DPSOFTRAST_Draw_SyncCommands();
796         for(;;)
797         {
798                 int waitindex = -1;
799                 int commandoffset;
800                 usedcommands = 0;
801                 for (i = 0; i < dpsoftrast.numthreads; i++)
802                 {
803                         thread = &dpsoftrast.threads[i]; 
804                         commandoffset = freecommand - thread->commandoffset;
805                         if (commandoffset < 0)
806                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
807                         if (commandoffset > usedcommands)
808                         {
809                                 waitindex = i;
810                                 usedcommands = commandoffset;
811                         }
812                 }
813                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
814                         break;
815                 thread = &dpsoftrast.threads[waitindex];
816                 Thread_LockMutex(thread->drawmutex);
817                 if (thread->commandoffset != dpsoftrast.drawcommand)
818                 {
819                         thread->waiting = true;
820                         if (thread->starving) Thread_CondSignal(thread->drawcond);
821                         Thread_CondWait(thread->waitcond, thread->drawmutex);
822                         thread->waiting = false;
823                 }
824                 Thread_UnlockMutex(thread->drawmutex);
825         }
826         dpsoftrast.commandpool.usedcommands = usedcommands;
827 }
828
// number of padding bytes needed to bring 'size' up to the next multiple of COMMAND_SIZE
// (the masking assumes COMMAND_SIZE is a power of two)
#define DPSOFTRAST_COMMANDPADDING(size) \
	((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))
// 'size' rounded up to a multiple of COMMAND_SIZE
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + DPSOFTRAST_COMMANDPADDING(size))
// reserves an aligned DPSOFTRAST_Command_<name> in the command pool, tagged with
// DPSOFTRAST_OPCODE_<name>, and yields a typed pointer to it
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
833
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
835 {
836         DPSOFTRAST_Command *command;
837         int freecommand = dpsoftrast.commandpool.freecommand;
838         int usedcommands = dpsoftrast.commandpool.usedcommands;
839         int extra = sizeof(DPSOFTRAST_Command);
840         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
842         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
843         {
844                 if (dpsoftrast.usethreads)
845                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
846                 else
847                         DPSOFTRAST_Draw_FlushThreads();
848                 freecommand = dpsoftrast.commandpool.freecommand;
849                 usedcommands = dpsoftrast.commandpool.usedcommands;
850         }
851         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
852         {
853                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854                 command->opcode = DPSOFTRAST_OPCODE_Reset;
855                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
856                 freecommand = 0;
857         }
858         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859         command->opcode = opcode;
860         command->commandsize = size;
861         freecommand += size;
862         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
863                 freecommand = 0;
864         dpsoftrast.commandpool.freecommand = freecommand;
865         dpsoftrast.commandpool.usedcommands = usedcommands + size;
866         return command;
867 }
868
869 static void DPSOFTRAST_UndoCommand(int size)
870 {
871         int freecommand = dpsoftrast.commandpool.freecommand;
872         int usedcommands = dpsoftrast.commandpool.usedcommands;
873         freecommand -= size;
874         if (freecommand < 0)
875                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876         usedcommands -= size;
877         dpsoftrast.commandpool.freecommand = freecommand;
878         dpsoftrast.commandpool.usedcommands = usedcommands;
879 }
880                 
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
883 {
884         thread->viewport[0] = command->x;
885         thread->viewport[1] = command->y;
886         thread->viewport[2] = command->width;
887         thread->viewport[3] = command->height;
888         thread->validate |= DPSOFTRAST_VALIDATE_FB;
889 }
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
891 {
892         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
893         command->x = x;
894         command->y = y;
895         command->width = width;
896         command->height = height;
897
898         dpsoftrast.viewport[0] = x;
899         dpsoftrast.viewport[1] = y;
900         dpsoftrast.viewport[2] = width;
901         dpsoftrast.viewport[3] = height;
902         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
903 }
904
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
907 {
908         int i, x1, y1, x2, y2, w, h, x, y;
909         int miny1, maxy1, miny2, maxy2;
910         int bandy;
911         unsigned int *p;
912         unsigned int c;
913         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914         miny1 = thread->miny1;
915         maxy1 = thread->maxy1;
916         miny2 = thread->miny2;
917         maxy2 = thread->maxy2;
918         x1 = thread->fb_scissor[0];
919         y1 = thread->fb_scissor[1];
920         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
922         if (y1 < miny1) y1 = miny1;
923         if (y2 > maxy2) y2 = maxy2;
924         w = x2 - x1;
925         h = y2 - y1;
926         if (w < 1 || h < 1)
927                 return;
928         // FIXME: honor fb_colormask?
929         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
930         for (i = 0;i < 4;i++)
931         {
932                 if (!dpsoftrast.fb_colorpixels[i])
933                         continue;
934                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
935                 for (;y < bandy;y++)
936                 {
937                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938                         for (x = x1;x < x2;x++)
939                                 p[x] = c;
940                 }
941         }
942 }
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
944 {
945         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
946         command->r = r;
947         command->g = g;
948         command->b = b;
949         command->a = a;
950 }
951
952 DEFCOMMAND(3, ClearDepth, float depth;)
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
954 {
955         int x1, y1, x2, y2, w, h, x, y;
956         int miny1, maxy1, miny2, maxy2;
957         int bandy;
958         unsigned int *p;
959         unsigned int c;
960         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961         miny1 = thread->miny1;
962         maxy1 = thread->maxy1;
963         miny2 = thread->miny2;
964         maxy2 = thread->maxy2;
965         x1 = thread->fb_scissor[0];
966         y1 = thread->fb_scissor[1];
967         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
969         if (y1 < miny1) y1 = miny1;
970         if (y2 > maxy2) y2 = maxy2;
971         w = x2 - x1;
972         h = y2 - y1;
973         if (w < 1 || h < 1)
974                 return;
975         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
976         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977         for (;y < bandy;y++)
978         {
979                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980                 for (x = x1;x < x2;x++)
981                         p[x] = c;
982         }
983 }
984 void DPSOFTRAST_ClearDepth(float d)
985 {
986         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
987         command->depth = d;
988 }
989
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
992 {
993         thread->colormask[0] = command->r != 0;
994         thread->colormask[1] = command->g != 0;
995         thread->colormask[2] = command->b != 0;
996         thread->colormask[3] = command->a != 0;
997         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
998 }
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1000 {
1001         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1002         command->r = r;
1003         command->g = g;
1004         command->b = b;
1005         command->a = a;
1006 }
1007
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1010 {
1011         thread->depthtest = command->enable;
1012         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1013 }
1014 void DPSOFTRAST_DepthTest(int enable)
1015 {
1016         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017         command->enable = enable;
1018 }
1019
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1022 {
1023         thread->scissortest = command->enable;
1024         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1025 }
1026 void DPSOFTRAST_ScissorTest(int enable)
1027 {
1028         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029         command->enable = enable;
1030 }
1031
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1034 {
1035         thread->scissor[0] = command->x;
1036         thread->scissor[1] = command->y;
1037         thread->scissor[2] = command->width;
1038         thread->scissor[3] = command->height;
1039         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1040 }
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1042 {
1043         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1044         command->x = x;
1045         command->y = y;
1046         command->width = width;
1047         command->height = height;
1048 }
1049
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1052 {
1053         thread->blendfunc[0] = command->sfactor;
1054         thread->blendfunc[1] = command->dfactor;
1055         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1056 }
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1058 {
1059         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060         command->sfactor = sfactor;
1061         command->dfactor = dfactor;
1062 }
1063
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1066 {
1067         thread->blendsubtract = command->enable;
1068         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1069 }
1070 void DPSOFTRAST_BlendSubtract(int enable)
1071 {
1072         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073         command->enable = enable;
1074 }
1075
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1078 {
1079         thread->depthmask = command->enable;
1080 }
1081 void DPSOFTRAST_DepthMask(int enable)
1082 {
1083         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084         command->enable = enable;
1085 }
1086
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1089 {
1090         thread->depthfunc = command->func;
1091 }
1092 void DPSOFTRAST_DepthFunc(int func)
1093 {
1094         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095         command->func = func;
1096 }
1097
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1100 {
1101         thread->depthrange[0] = command->nearval;
1102         thread->depthrange[1] = command->farval;
1103 }
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1105 {
1106         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107         command->nearval = nearval;
1108         command->farval = farval;
1109 }
1110
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1113 {
1114         thread->polygonoffset[0] = command->alongnormal;
1115         thread->polygonoffset[1] = command->intoview;
1116 }
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1118 {
1119         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120         command->alongnormal = alongnormal;
1121         command->intoview = intoview;
1122 }
1123
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1126 {
1127         thread->cullface = command->mode;
1128 }
1129 void DPSOFTRAST_CullFace(int mode)
1130 {
1131         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132         command->mode = mode;
1133 }
1134
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1137 {
1138         thread->alphatest = command->enable;
1139 }
1140 void DPSOFTRAST_AlphaTest(int enable)
1141 {
1142         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143         command->enable = enable;
1144 }
1145
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1148 {
1149         thread->alphafunc = command->func;
1150         thread->alphavalue = command->ref;
1151 }
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1153 {
1154         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155         command->func = func;
1156         command->ref = ref;
1157 }
1158
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1160 {
1161         dpsoftrast.color[0] = r;
1162         dpsoftrast.color[1] = g;
1163         dpsoftrast.color[2] = b;
1164         dpsoftrast.color[3] = a;
1165 }
1166
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1168 {
1169         int outstride = blockwidth * 4;
1170         int instride = dpsoftrast.fb_width * 4;
1171         int bx1 = blockx;
1172         int by1 = blocky;
1173         int bx2 = blockx + blockwidth;
1174         int by2 = blocky + blockheight;
1175         int bw;
1176         int x;
1177         int y;
1178         unsigned char *inpixels;
1179         unsigned char *b;
1180         unsigned char *o;
1181         DPSOFTRAST_Flush();
1182         if (bx1 < 0) bx1 = 0;
1183         if (by1 < 0) by1 = 0;
1184         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1186         bw = bx2 - bx1;
1187         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1188         if (dpsoftrast.bigendian)
1189         {
1190                 for (y = by1;y < by2;y++)
1191                 {
1192                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1194                         for (x = bx1;x < bx2;x++)
1195                         {
1196                                 o[0] = b[3];
1197                                 o[1] = b[2];
1198                                 o[2] = b[1];
1199                                 o[3] = b[0];
1200                                 o += 4;
1201                                 b += 4;
1202                         }
1203                 }
1204         }
1205         else
1206         {
1207                 for (y = by1;y < by2;y++)
1208                 {
1209                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1211                         memcpy(o, b, bw*4);
1212                 }
1213         }
1214
1215 }
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1217 {
1218         int tx1 = tx;
1219         int ty1 = ty;
1220         int tx2 = tx + width;
1221         int ty2 = ty + height;
1222         int sx1 = sx;
1223         int sy1 = sy;
1224         int sx2 = sx + width;
1225         int sy2 = sy + height;
1226         int swidth;
1227         int sheight;
1228         int twidth;
1229         int theight;
1230         int sw;
1231         int sh;
1232         int tw;
1233         int th;
1234         int y;
1235         unsigned int *spixels;
1236         unsigned int *tpixels;
1237         DPSOFTRAST_Texture *texture;
1238         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239         if (mip < 0 || mip >= texture->mipmaps) return;
1240         DPSOFTRAST_Flush();
1241         spixels = dpsoftrast.fb_colorpixels[0];
1242         swidth = dpsoftrast.fb_width;
1243         sheight = dpsoftrast.fb_height;
1244         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245         twidth = texture->mipmap[mip][2];
1246         theight = texture->mipmap[mip][3];
1247         if (tx1 < 0) tx1 = 0;
1248         if (ty1 < 0) ty1 = 0;
1249         if (tx2 > twidth) tx2 = twidth;
1250         if (ty2 > theight) ty2 = theight;
1251         if (sx1 < 0) sx1 = 0;
1252         if (sy1 < 0) sy1 = 0;
1253         if (sx2 > swidth) sx2 = swidth;
1254         if (sy2 > sheight) sy2 = sheight;
1255         tw = tx2 - tx1;
1256         th = ty2 - ty1;
1257         sw = sx2 - sx1;
1258         sh = sy2 - sy1;
1259         if (tw > sw) tw = sw;
1260         if (th > sh) th = sh;
1261         if (tw < 1 || th < 1)
1262                 return;
1263         sy1 = sheight - 1 - sy1;
1264         for (y = 0;y < th;y++)
1265                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1266         if (texture->mipmaps > 1)
1267                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1268 }
1269
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1272 {
1273         if (thread->texbound[command->unitnum])
1274                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275         thread->texbound[command->unitnum] = command->texture;
1276 }
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1278 {
1279         DPSOFTRAST_Command_SetTexture *command;
1280         DPSOFTRAST_Texture *texture;
1281         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1282         {
1283                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1284                 return;
1285         }
1286         texture = DPSOFTRAST_Texture_GetByIndex(index);
1287         if (index && !texture)
1288         {
1289                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1290                 return;
1291         }
1292
1293         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294         command->unitnum = unitnum;
1295         command->texture = texture;
1296
1297         dpsoftrast.texbound[unitnum] = texture;
1298         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1299 }
1300
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1302 {
1303         dpsoftrast.pointer_vertex3f = vertex3f;
1304         dpsoftrast.stride_vertex = stride;
1305 }
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1307 {
1308         dpsoftrast.pointer_color4f = color4f;
1309         dpsoftrast.pointer_color4ub = NULL;
1310         dpsoftrast.stride_color = stride;
1311 }
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1313 {
1314         dpsoftrast.pointer_color4f = NULL;
1315         dpsoftrast.pointer_color4ub = color4ub;
1316         dpsoftrast.stride_color = stride;
1317 }
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1319 {
1320         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322         dpsoftrast.stride_texcoord[unitnum] = stride;
1323 }
1324
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1327 {
1328         thread->shader_mode = command->mode;
1329         thread->shader_permutation = command->permutation;
1330         thread->shader_exactspecularmath = command->exactspecularmath;
1331 }
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1333 {
1334         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335         command->mode = mode;
1336         command->permutation = permutation;
1337         command->exactspecularmath = exactspecularmath;
1338
1339         dpsoftrast.shader_mode = mode;
1340         dpsoftrast.shader_permutation = permutation;
1341         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1342 }
1343
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1346 {
1347         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1348 }
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1350 {
1351         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352         command->index = index;
1353         command->val[0] = v0;
1354         command->val[1] = v1;
1355         command->val[2] = v2;
1356         command->val[3] = v3;
1357
1358         dpsoftrast.uniform4f[index*4+0] = v0;
1359         dpsoftrast.uniform4f[index*4+1] = v1;
1360         dpsoftrast.uniform4f[index*4+2] = v2;
1361         dpsoftrast.uniform4f[index*4+3] = v3;
1362 }
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1364 {
1365         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366         command->index = index;
1367         memcpy(command->val, v, sizeof(command->val));
1368
1369         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1370 }
1371
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1374 {
1375         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1376 }
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1378 {
1379 #ifdef SSE_POSSIBLE
1380         int i, index;
1381         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1382         {
1383                 __m128 m0, m1, m2, m3;
1384                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385                 command->index = (DPSOFTRAST_UNIFORM)index;
1386                 if (((size_t)v)&(ALIGN_SIZE-1))
1387                 {
1388                         m0 = _mm_loadu_ps(v);
1389                         m1 = _mm_loadu_ps(v+4);
1390                         m2 = _mm_loadu_ps(v+8);
1391                         m3 = _mm_loadu_ps(v+12);
1392                 }
1393                 else
1394                 {
1395                         m0 = _mm_load_ps(v);
1396                         m1 = _mm_load_ps(v+4);
1397                         m2 = _mm_load_ps(v+8);
1398                         m3 = _mm_load_ps(v+12);
1399                 }
1400                 if (transpose)
1401                 {
1402                         __m128 t0, t1, t2, t3;
1403                         t0 = _mm_unpacklo_ps(m0, m1);
1404                         t1 = _mm_unpacklo_ps(m2, m3);
1405                         t2 = _mm_unpackhi_ps(m0, m1);
1406                         t3 = _mm_unpackhi_ps(m2, m3);
1407                         m0 = _mm_movelh_ps(t0, t1);
1408                         m1 = _mm_movehl_ps(t1, t0);
1409                         m2 = _mm_movelh_ps(t2, t3);
1410                         m3 = _mm_movehl_ps(t3, t2);                     
1411                 }
1412                 _mm_store_ps(command->val, m0);
1413                 _mm_store_ps(command->val+4, m1);
1414                 _mm_store_ps(command->val+8, m2);
1415                 _mm_store_ps(command->val+12, m3);
1416                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1420         }
1421 #endif
1422 }
1423
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1426 {
1427         thread->uniform1i[command->index] = command->val;
1428 }
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1430 {
1431         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432         command->index = index;
1433         command->val = i0;
1434
1435         dpsoftrast.uniform1i[command->index] = i0;
1436 }
1437
1438 #ifdef SSE_POSSIBLE
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1440 {
1441         float *end = dst + size*4;
1442         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1443         {
1444                 while (dst < end)
1445                 {
1446                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1447                         dst += 4;
1448                         src += stride;
1449                 }
1450         }
1451         else
1452         {
1453                 while (dst < end)
1454                 {
1455                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1456                         dst += 4;
1457                         src += stride;
1458                 }
1459         }
1460 }
1461
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1463 {
1464         float *end = dst + size*4;
1465         if (stride == sizeof(float[3]))
1466         {
1467                 float *end4 = dst + (size&~3)*4;        
1468                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1469                 {
1470                         while (dst < end4)
1471                         {
1472                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1473                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dst += 16;
1486                                 src += 4*sizeof(float[3]);
1487                         }
1488                 }
1489                 else
1490                 {
1491                         while (dst < end4)
1492                         {
1493                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506                                 dst += 16;
1507                                 src += 4*sizeof(float[3]);
1508                         }
1509                 }
1510         }
1511         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1512         {
1513                 while (dst < end)
1514                 {
1515                         __m128 v = _mm_loadu_ps((const float *)src);
1516                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519                         _mm_store_ps(dst, v);
1520                         dst += 4;
1521                         src += stride;
1522                 }
1523         }
1524         else
1525         {
1526                 while (dst < end)
1527                 {
1528                         __m128 v = _mm_load_ps((const float *)src);
1529                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532                         _mm_store_ps(dst, v);
1533                         dst += 4;
1534                         src += stride;
1535                 }
1536         }
1537 }
1538
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1540 {
1541         float *end = dst + size*4;
1542         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1543         if (stride == sizeof(float[2]))
1544         {
1545                 float *end2 = dst + (size&~1)*4;
1546                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1547                 {
1548                         while (dst < end2)
1549                         {
1550                                 __m128 v = _mm_loadu_ps((const float *)src);
1551                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553                                 dst += 8;
1554                                 src += 2*sizeof(float[2]);
1555                         }
1556                 }
1557                 else
1558                 {
1559                         while (dst < end2)
1560                         {
1561                                 __m128 v = _mm_load_ps((const float *)src);
1562                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1564                                 dst += 8;
1565                                 src += 2*sizeof(float[2]);
1566                         }
1567                 }
1568         }
1569         while (dst < end)
1570         {
1571                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1572                 dst += 4;
1573                 src += stride;
1574         }
1575 }
1576
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 {
1579         float *end = dst + size*4;
1580         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1581         if (stride == sizeof(unsigned char[4]))
1582         {
1583                 float *end4 = dst + (size&~3)*4;
1584                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585                 {
1586                         while (dst < end4)
1587                         {
1588                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1593                                 dst += 16;
1594                                 src += 4*sizeof(unsigned char[4]);
1595                         }
1596                 }
1597                 else
1598                 {
1599                         while (dst < end4)
1600                         {
1601                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1606                                 dst += 16;
1607                                 src += 4*sizeof(unsigned char[4]);
1608                         }
1609                 }
1610         }
1611         while (dst < end)
1612         {
1613                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1615                 dst += 4;
1616                 src += stride;
1617         }
1618 }
1619
// Writes `size` copies of the 4-float vector at src to dst.
// src is read once with an unaligned load; dst is written with aligned stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + 4*i, value);
}
1630 #endif
1631
// Transforms `numitems` 4-float vectors: out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15],
// where M is inmatrix16f and (x, y, z, w) is the input vector.
// out4f is written with aligned stores and may be the same buffer as in4f.
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int unaligned, i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: a plain copy is enough (and nothing at all when done in place)
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	// mx..mw are the matrix vectors scaled by input x, y, z and w respectively
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	// in4f advances by 16 bytes per item, so its alignment is tested once
	unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
	{
		__m128 v, zw, yzw;
		if (unaligned)
			v = _mm_loadu_ps(in4f);
		else
			v = _mm_load_ps(in4f);
		// summed innermost-first in the same association: x*mx + (y*my + (z*mz + w*mw))
		zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz),
		                _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw));
		yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), zw);
		_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), yzw));
	}
#endif
}
1679
// Copies `numitems` 4-float vectors from in4f to out4f.
// Nothing is copied when numitems is not positive or when out4f and in4f are
// the same buffer: memcpy must not be given identical source and destination,
// and a negative count would turn into a huge size_t.
// The buffers must not partially overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1684
1685 #ifdef SSE_POSSIBLE
// Perspective-divides and viewport-maps one vertex.
// With in = (x, y, z, w), viewportcenter = (c0, c1, c2, c3) and viewportscale = (s0, s1, s2, s3):
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// `in` is read before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 pv_clip = (in); \
	__m128 pv_w = _mm_shuffle_ps(pv_clip, pv_clip, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 pv_rot = _mm_shuffle_ps(pv_clip, pv_clip, _MM_SHUFFLE(2, 1, 0, 3)); \
	pv_rot = _mm_move_ss(pv_rot, _mm_set_ss(1.0f)); \
	pv_rot = _mm_mul_ps((viewportscale), pv_rot); \
	pv_rot = _mm_div_ps(pv_rot, pv_w); \
	pv_rot = _mm_add_ps((viewportcenter), pv_rot); \
	out = _mm_shuffle_ps(pv_rot, pv_rot, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1693
// Performs exactly the same computation as DPSOFTRAST_PROJECTVERTEX (the two
// macro bodies were identical), so it simply forwards to it.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
1701
// Transforms one vertex: with in = (x, y, z, w),
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `in` is read before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 tv_src = (in); \
	__m128 tv_zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(2, 2, 2, 2)), (m2)), \
	                          _mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(3, 3, 3, 3)), (m3))); \
	__m128 tv_yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(1, 1, 1, 1)), (m1)), tv_zw); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(0, 0, 0, 0)), (m0)), tv_yzw); \
}
1710
// Bounds the projected y range of an axis-aligned box after transforming its
// eight corners by inmatrix16f.
//   starty, endy : outputs, derived from the largest / smallest projected y
//   minposf, maxposf : box extremes, 4 floats each, read with aligned loads
//   inmatrix16f : 16-float matrix, read with unaligned loads
// Returns clipmask: bit k stays set unless corner k has z + w >= 0 after the
// transform.
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1712 {
1713         int clipmask = 0xFF;
1714         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
             // minproj/maxproj start as an empty range (min above max); only lane 0 of them is used
1715         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
             // swap the first two components of each matrix vector so that the
             // transformed y ends up in lane 0, where the scalar (_ss) operations work
1718         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
             // BBFRONT(k, pos): transform corner k into bb[k] and store z + w in
             // clipdist[k]; when that is >= 0, clear bit k of clipmask and fold
             // y/w into minproj/maxproj
1722         #define BBFRONT(k, pos) \
1723         { \
1724                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1727                 { \
1728                         __m128 proj; \
1729                         clipmask &= ~(1<<k); \
1730                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731                         minproj = _mm_min_ss(minproj, proj); \
1732                         maxproj = _mm_max_ss(maxproj, proj); \
1733                 } \
1734         }
             // corner k takes x from maxpos when bit 0 of k is set, y when bit 1 is
             // set, and z together with w when bit 2 is set; otherwise from minpos
1735         BBFRONT(0, minpos); 
1736         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1737         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1738         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1739         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1740         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1741         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1742         BBFRONT(7, maxpos);
             // BBCLIP(k): for a corner whose clipmask bit is still set, visit the
             // three corners that differ from it in one bit (k^1, k^2, k^4); for each
             // one whose bit is clear, interpolate along that edge to the point where
             // z + w reaches 0 and fold that point's y/w into minproj/maxproj
1743         #define BBCLIP(k) \
1744         { \
1745                 if (clipmask&(1<<k)) \
1746                 { \
1747                         if (!(clipmask&(1<<(k^1)))) \
1748                         { \
1749                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752                                 minproj = _mm_min_ss(minproj, proj); \
1753                                 maxproj = _mm_max_ss(maxproj, proj); \
1754                         } \
1755                         if (!(clipmask&(1<<(k^2)))) \
1756                         { \
1757                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760                                 minproj = _mm_min_ss(minproj, proj); \
1761                                 maxproj = _mm_max_ss(maxproj, proj); \
1762                         } \
1763                         if (!(clipmask&(1<<(k^4)))) \
1764                         { \
1765                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768                                 minproj = _mm_min_ss(minproj, proj); \
1769                                 maxproj = _mm_max_ss(maxproj, proj); \
1770                         } \
1771                 } \
1772         }
1773         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
             // bring element 2 of the viewport center/scale vectors into lane 0
             // (the element DPSOFTRAST_PROJECTVERTEX pairs with y)
1774         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
             // keep minproj at or above -2 and maxproj at or below 2 before mapping to the viewport
1776         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
             // NOTE(review): starty comes from maxproj and endy from minproj, which
             // presumably means the y viewport scale is negative - confirm
1780         *starty = _mm_cvttss_si32(maxproj);
1781         *endy = _mm_cvttss_si32(minproj)+1;
1782         return clipmask;
1783 }
1784         
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1786 {
1787         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788         float *end = out4f + numitems*4;
1789         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790         __m128 minpos, maxpos;
1791         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1792         {
1793                 minpos = maxpos = _mm_loadu_ps(in4f);
1794                 while (out4f < end)
1795                 {
1796                         __m128 v = _mm_loadu_ps(in4f);
1797                         minpos = _mm_min_ps(minpos, v);
1798                         maxpos = _mm_max_ps(maxpos, v);
1799                         _mm_store_ps(out4f, v);
1800                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801                         _mm_store_ps(screen4f, v);
1802                         in4f += 4;
1803                         out4f += 4;
1804                         screen4f += 4;
1805                 }
1806         }
1807         else
1808         {
1809                 minpos = maxpos = _mm_load_ps(in4f);
1810                 while (out4f < end)
1811                 {
1812                         __m128 v = _mm_load_ps(in4f);
1813                         minpos = _mm_min_ps(minpos, v);
1814                         maxpos = _mm_max_ps(maxpos, v);
1815                         _mm_store_ps(out4f, v);
1816                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817                         _mm_store_ps(screen4f, v);
1818                         in4f += 4;
1819                         out4f += 4;
1820                         screen4f += 4;
1821                 }
1822         }
1823         if (starty && endy) 
1824         {
1825                 ALIGN(float minposf[4]);
1826                 ALIGN(float maxposf[4]);
1827                 _mm_store_ps(minposf, minpos);
1828                 _mm_store_ps(maxposf, maxpos);
1829                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1830         }
1831         return 0;
1832 }
1833
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1835 {
1836         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1838         float *end;
1839         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841         end = out4f + numitems*4;
1842         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844         m0 = _mm_loadu_ps(inmatrix16f);
1845         m1 = _mm_loadu_ps(inmatrix16f + 4);
1846         m2 = _mm_loadu_ps(inmatrix16f + 8);
1847         m3 = _mm_loadu_ps(inmatrix16f + 12);
1848         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1849         {
1850                 minpos = maxpos = _mm_loadu_ps(in4f);
1851                 while (out4f < end)
1852                 {
1853                         __m128 v = _mm_loadu_ps(in4f);
1854                         minpos = _mm_min_ps(minpos, v);
1855                         maxpos = _mm_max_ps(maxpos, v);
1856                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857                         _mm_store_ps(out4f, v);
1858                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859                         _mm_store_ps(screen4f, v);
1860                         in4f += 4;
1861                         out4f += 4;
1862                         screen4f += 4;
1863                 }
1864         }
1865         else
1866         {
1867                 minpos = maxpos = _mm_load_ps(in4f);
1868                 while (out4f < end)
1869                 {
1870                         __m128 v = _mm_load_ps(in4f);
1871                         minpos = _mm_min_ps(minpos, v);
1872                         maxpos = _mm_max_ps(maxpos, v);
1873                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874                         _mm_store_ps(out4f, v);
1875                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876                         _mm_store_ps(screen4f, v);
1877                         in4f += 4;
1878                         out4f += 4;
1879                         screen4f += 4;
1880                 }
1881         }
1882         if (starty && endy) 
1883         {
1884                 ALIGN(float minposf[4]);
1885                 ALIGN(float maxposf[4]);
1886                 _mm_store_ps(minposf, minpos);
1887                 _mm_store_ps(maxposf, maxpos);
1888                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1889         }
1890         return 0;
1891 }
1892 #endif
1893
// Expands the vertex attribute selected by `inarray` into 4-float vectors in
// dpsoftrast.post_array4f[outarray], for dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex, and returns that buffer.
//   position : 3 floats per vertex, w filled with 1
//   color    : float4 pointer if set, else ubyte4 pointer if set, else the constant dpsoftrast.color
//   other    : texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with 2, 3 or 4 components
// If the texcoord pointer is NULL or its component count is not 2, 3 or 4 the
// buffer is returned without being written.  Returns NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride, unit, components;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
		return outf;
	}
	if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		return outf;
	}
	// everything else is treated as a texcoord array
	unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
	if (!dpsoftrast.pointer_texcoordf[unit])
		return outf;
	stride = dpsoftrast.stride_texcoord[unit];
	inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
	components = dpsoftrast.components_texcoord[unit];
	if (components == 2)
		DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
	else if (components == 3)
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	else if (components == 4)
		DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
	return outf;
#else
	return NULL;
#endif
}
1952
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1954 {
1955         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1957         return data;
1958 }
1959
1960 #if 0
// Projects a vertex array to screen space without transforming it, storing the
// screen coordinates in dpsoftrast.screencoord4f and the y range and clip
// result in dpsoftrast.drawstarty / drawendy / drawclipped.
// With inarray >= 0 the array is first loaded into post_array4f[outarray];
// otherwise the current contents of post_array4f[outarray] are used.
// Returns the array, or NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1971 #endif
1972
// Transforms a vertex array in place by inmatrix16f and projects it to screen
// space, storing the screen coordinates in dpsoftrast.screencoord4f and the
// y range and clip result in dpsoftrast.drawstarty / drawendy / drawclipped.
// With inarray >= 0 the array is first loaded into post_array4f[outarray];
// otherwise the current contents of post_array4f[outarray] are used.
// Returns the array, or NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1983
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1985 {
1986         int x;
1987         int startx = span->startx;
1988         int endx = span->endx;
1989         float wslope = triangle->w[0];
1990         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1991         float endz = 1.0f / (w + wslope * startx);
1992         for (x = startx;x < endx;)
1993         {
1994                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1995                 float z = endz, dz;
1996                 if (nextsub >= endx) nextsub = endsub = endx-1;
1997                 endz = 1.0f / (w + wslope * nextsub);
1998                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1999                 for (; x <= endsub; x++, z += dz)
2000                         zf[x] = z;
2001         }
2002 }
2003
2004 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2005 {
2006         int x;
2007         int startx = span->startx;
2008         int endx = span->endx;
2009         int d[4];
2010         float a, b;
2011         unsigned char * RESTRICT pixelmask = span->pixelmask;
2012         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2013         if (!pixel)
2014                 return;
2015         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2016         // handle alphatest now (this affects depth writes too)
2017         if (thread->alphatest)
2018                 for (x = startx;x < endx;x++)
2019                         if (in4f[x*4+3] < 0.5f)
2020                                 pixelmask[x] = false;
2021         // FIXME: this does not handle bigendian
2022         switch(thread->fb_blendmode)
2023         {
2024         case DPSOFTRAST_BLENDMODE_OPAQUE:
2025                 for (x = startx;x < endx;x++)
2026                 {
2027                         if (!pixelmask[x])
2028                                 continue;
2029                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2030                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2031                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2032                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2033                         pixel[x*4+0] = d[0];
2034                         pixel[x*4+1] = d[1];
2035                         pixel[x*4+2] = d[2];
2036                         pixel[x*4+3] = d[3];
2037                 }
2038                 break;
2039         case DPSOFTRAST_BLENDMODE_ALPHA:
2040                 for (x = startx;x < endx;x++)
2041                 {
2042                         if (!pixelmask[x])
2043                                 continue;
2044                         a = in4f[x*4+3] * 255.0f;
2045                         b = 1.0f - in4f[x*4+3];
2046                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2047                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2048                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2049                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2050                         pixel[x*4+0] = d[0];
2051                         pixel[x*4+1] = d[1];
2052                         pixel[x*4+2] = d[2];
2053                         pixel[x*4+3] = d[3];
2054                 }
2055                 break;
2056         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2057                 for (x = startx;x < endx;x++)
2058                 {
2059                         if (!pixelmask[x])
2060                                 continue;
2061                         a = in4f[x*4+3] * 255.0f;
2062                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066                         pixel[x*4+0] = d[0];
2067                         pixel[x*4+1] = d[1];
2068                         pixel[x*4+2] = d[2];
2069                         pixel[x*4+3] = d[3];
2070                 }
2071                 break;
2072         case DPSOFTRAST_BLENDMODE_ADD:
2073                 for (x = startx;x < endx;x++)
2074                 {
2075                         if (!pixelmask[x])
2076                                 continue;
2077                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081                         pixel[x*4+0] = d[0];
2082                         pixel[x*4+1] = d[1];
2083                         pixel[x*4+2] = d[2];
2084                         pixel[x*4+3] = d[3];
2085                 }
2086                 break;
2087         case DPSOFTRAST_BLENDMODE_INVMOD:
2088                 for (x = startx;x < endx;x++)
2089                 {
2090                         if (!pixelmask[x])
2091                                 continue;
2092                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2093                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2094                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2095                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2096                         pixel[x*4+0] = d[0];
2097                         pixel[x*4+1] = d[1];
2098                         pixel[x*4+2] = d[2];
2099                         pixel[x*4+3] = d[3];
2100                 }
2101                 break;
2102         case DPSOFTRAST_BLENDMODE_MUL:
2103                 for (x = startx;x < endx;x++)
2104                 {
2105                         if (!pixelmask[x])
2106                                 continue;
2107                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2108                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2109                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2110                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2111                         pixel[x*4+0] = d[0];
2112                         pixel[x*4+1] = d[1];
2113                         pixel[x*4+2] = d[2];
2114                         pixel[x*4+3] = d[3];
2115                 }
2116                 break;
2117         case DPSOFTRAST_BLENDMODE_MUL2:
2118                 for (x = startx;x < endx;x++)
2119                 {
2120                         if (!pixelmask[x])
2121                                 continue;
2122                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2123                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2124                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2125                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2126                         pixel[x*4+0] = d[0];
2127                         pixel[x*4+1] = d[1];
2128                         pixel[x*4+2] = d[2];
2129                         pixel[x*4+3] = d[3];
2130                 }
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2133                 for (x = startx;x < endx;x++)
2134                 {
2135                         if (!pixelmask[x])
2136                                 continue;
2137                         a = in4f[x*4+3] * -255.0f;
2138                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2139                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2140                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2141                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2142                         pixel[x*4+0] = d[0];
2143                         pixel[x*4+1] = d[1];
2144                         pixel[x*4+2] = d[2];
2145                         pixel[x*4+3] = d[3];
2146                 }
2147                 break;
2148         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2149                 for (x = startx;x < endx;x++)
2150                 {
2151                         if (!pixelmask[x])
2152                                 continue;
2153                         a = 255.0f;
2154                         b = 1.0f - in4f[x*4+3];
2155                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2156                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2157                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2158                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2159                         pixel[x*4+0] = d[0];
2160                         pixel[x*4+1] = d[1];
2161                         pixel[x*4+2] = d[2];
2162                         pixel[x*4+3] = d[3];
2163                 }
2164                 break;
2165         case DPSOFTRAST_BLENDMODE_INVADD:
2166                 for (x = startx;x < endx;x++)
2167                 {
2168                         if (!pixelmask[x])
2169                                 continue;
2170                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2171                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2172                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2173                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2174                         pixel[x*4+0] = d[0];
2175                         pixel[x*4+1] = d[1];
2176                         pixel[x*4+2] = d[2];
2177                         pixel[x*4+3] = d[3];
2178                 }
2179                 break;
2180         }
2181 }
2182
2183 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2184 {
2185 #ifdef SSE_POSSIBLE
2186         int x;
2187         int startx = span->startx;
2188         int endx = span->endx;
2189         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2190         unsigned char * RESTRICT pixelmask = span->pixelmask;
2191         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2192         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2193         if (!pixel)
2194                 return;
2195         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2196         pixeli += span->y * dpsoftrast.fb_width + span->x;
2197         // handle alphatest now (this affects depth writes too)
2198         if (thread->alphatest)
2199                 for (x = startx;x < endx;x++)
2200                         if (in4ub[x*4+3] < 0.5f)
2201                                 pixelmask[x] = false;
2202         // FIXME: this does not handle bigendian
2203         switch(thread->fb_blendmode)
2204         {
2205         case DPSOFTRAST_BLENDMODE_OPAQUE:
2206                 for (x = startx;x + 4 <= endx;)
2207                 {
2208                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2209                         {
2210                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2211                                 x += 4;
2212                         }
2213                         else
2214                         {
2215                                 if (pixelmask[x])
2216                                         pixeli[x] = ini[x];
2217                                 x++;
2218                         }
2219                 }
2220                 for (;x < endx;x++)
2221                         if (pixelmask[x])
2222                                 pixeli[x] = ini[x];
2223                 break;
2224         case DPSOFTRAST_BLENDMODE_ALPHA:
2225         #define FINISHBLEND(blend2, blend1) \
2226                 for (x = startx;x + 1 < endx;x += 2) \
2227                 { \
2228                         __m128i src, dst; \
2229                         switch (*(const unsigned short*)&pixelmask[x]) \
2230                         { \
2231                         case 0x0101: \
2232                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2234                                 blend2; \
2235                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2236                                 continue; \
2237                         case 0x0100: \
2238                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2239                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2240                                 blend1; \
2241                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2242                                 continue; \
2243                         case 0x0001: \
2244                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2245                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2246                                 blend1; \
2247                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2248                                 continue; \
2249                         } \
2250                         break; \
2251                 } \
2252                 for(;x < endx; x++) \
2253                 { \
2254                         __m128i src, dst; \
2255                         if (!pixelmask[x]) \
2256                                 continue; \
2257                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2258                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2259                         blend1; \
2260                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2261                 }
2262
2263                 FINISHBLEND({
2264                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2265                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2266                 }, {
2267                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2268                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2269                 });
2270                 break;
2271         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2272                 FINISHBLEND({
2273                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2274                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275                 }, {
2276                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2277                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2278                 });
2279                 break;
2280         case DPSOFTRAST_BLENDMODE_ADD:
2281                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2282                 break;
2283         case DPSOFTRAST_BLENDMODE_INVMOD:
2284                 FINISHBLEND({
2285                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2286                 }, {
2287                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2288                 });
2289                 break;
2290         case DPSOFTRAST_BLENDMODE_MUL:
2291                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2292                 break;
2293         case DPSOFTRAST_BLENDMODE_MUL2:
2294                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2295                 break;
2296         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2297                 FINISHBLEND({
2298                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2299                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2300                 }, {
2301                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2302                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2303                 });
2304                 break;
2305         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2306                 FINISHBLEND({
2307                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2308                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2309                 }, {
2310                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2311                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2312                 });
2313                 break;
2314         case DPSOFTRAST_BLENDMODE_INVADD:
2315                 FINISHBLEND({
2316                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2317                 }, {
2318                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2319                 });
2320                 break;
2321         }
2322 #endif
2323 }
2324
2325 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2326 {
2327         int x;
2328         int startx = span->startx;
2329         int endx = span->endx;
2330         int flags;
2331         float c[4];
2332         float data[4];
2333         float slope[4];
2334         float tc[2], endtc[2];
2335         float tcscale[2];
2336         unsigned int tci[2];
2337         unsigned int tci1[2];
2338         unsigned int tcimin[2];
2339         unsigned int tcimax[2];
2340         int tciwrapmask[2];
2341         int tciwidth;
2342         int filter;
2343         int mip;
2344         const unsigned char * RESTRICT pixelbase;
2345         const unsigned char * RESTRICT pixel[4];
2346         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2347         // if no texture is bound, just fill it with white
2348         if (!texture)
2349         {
2350                 for (x = startx;x < endx;x++)
2351                 {
2352                         out4f[x*4+0] = 1.0f;
2353                         out4f[x*4+1] = 1.0f;
2354                         out4f[x*4+2] = 1.0f;
2355                         out4f[x*4+3] = 1.0f;
2356                 }
2357                 return;
2358         }
2359         mip = triangle->mip[texunitindex];
2360         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2361         // if this mipmap of the texture is 1 pixel, just fill it with that color
2362         if (texture->mipmap[mip][1] == 4)
2363         {
2364                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2365                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2366                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2367                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2368                 for (x = startx;x < endx;x++)
2369                 {
2370                         out4f[x*4+0] = c[0];
2371                         out4f[x*4+1] = c[1];
2372                         out4f[x*4+2] = c[2];
2373                         out4f[x*4+3] = c[3];
2374                 }
2375                 return;
2376         }
2377         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2378         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2379         flags = texture->flags;
2380         tcscale[0] = texture->mipmap[mip][2];
2381         tcscale[1] = texture->mipmap[mip][3];
2382         tciwidth = texture->mipmap[mip][2];
2383         tcimin[0] = 0;
2384         tcimin[1] = 0;
2385         tcimax[0] = texture->mipmap[mip][2]-1;
2386         tcimax[1] = texture->mipmap[mip][3]-1;
2387         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2388         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2389         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2390         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2391         for (x = startx;x < endx;)
2392         {
2393                 unsigned int subtc[2];
2394                 unsigned int substep[2];
2395                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2396                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2397                 if (nextsub >= endx)
2398                 {
2399                         nextsub = endsub = endx-1;      
2400                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2401                 }
2402                 tc[0] = endtc[0];
2403                 tc[1] = endtc[1];
2404                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2405                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2406                 substep[0] = (endtc[0] - tc[0]) * subscale;
2407                 substep[1] = (endtc[1] - tc[1]) * subscale;
2408                 subtc[0] = tc[0] * (1<<16);
2409                 subtc[1] = tc[1] * (1<<16);
2410                 if (filter)
2411                 {
2412                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2413                         {
2414                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415                                 {
2416                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419                                         tci[0] = subtc[0]>>16;
2420                                         tci[1] = subtc[1]>>16;
2421                                         tci1[0] = tci[0] + 1;
2422                                         tci1[1] = tci[1] + 1;
2423                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2424                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2425                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2426                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2427                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435                                         out4f[x*4+0] = c[0];
2436                                         out4f[x*4+1] = c[1];
2437                                         out4f[x*4+2] = c[2];
2438                                         out4f[x*4+3] = c[3];
2439                                 }
2440                         }
2441                         else
2442                         {
2443                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2444                                 {
2445                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2446                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2447                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2448                                         tci[0] = subtc[0]>>16;
2449                                         tci[1] = subtc[1]>>16;
2450                                         tci1[0] = tci[0] + 1;
2451                                         tci1[1] = tci[1] + 1;
2452                                         tci[0] &= tciwrapmask[0];
2453                                         tci[1] &= tciwrapmask[1];
2454                                         tci1[0] &= tciwrapmask[0];
2455                                         tci1[1] &= tciwrapmask[1];
2456                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2457                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2458                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2459                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2460                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2461                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2462                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2463                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2464                                         out4f[x*4+0] = c[0];
2465                                         out4f[x*4+1] = c[1];
2466                                         out4f[x*4+2] = c[2];
2467                                         out4f[x*4+3] = c[3];
2468                                 }
2469                         }
2470                 }
2471                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2472                 {
2473                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2474                         {
2475                                 tci[0] = subtc[0]>>16;
2476                                 tci[1] = subtc[1]>>16;
2477                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2478                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2479                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2480                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2481                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2482                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2483                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2484                                 out4f[x*4+0] = c[0];
2485                                 out4f[x*4+1] = c[1];
2486                                 out4f[x*4+2] = c[2];
2487                                 out4f[x*4+3] = c[3];
2488                         }
2489                 }
2490                 else
2491                 {
2492                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2493                         {
2494                                 tci[0] = subtc[0]>>16;
2495                                 tci[1] = subtc[1]>>16;
2496                                 tci[0] &= tciwrapmask[0];
2497                                 tci[1] &= tciwrapmask[1];
2498                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2499                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2500                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2501                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2502                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2503                                 out4f[x*4+0] = c[0];
2504                                 out4f[x*4+1] = c[1];
2505                                 out4f[x*4+2] = c[2];
2506                                 out4f[x*4+3] = c[3];
2507                         }
2508                 }
2509         }
2510 }
2511
2512 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2513 {
2514 #ifdef SSE_POSSIBLE
2515         int x;
2516         int startx = span->startx;
2517         int endx = span->endx;
2518         int flags;
2519         __m128 data, slope, tcscale;
2520         __m128i tcsize, tcmask, tcoffset, tcmax;
2521         __m128 tc, endtc;
2522         __m128i subtc, substep, endsubtc;
2523         int filter;
2524         int mip;
2525         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2526         const unsigned char * RESTRICT pixelbase;
2527         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2528         // if no texture is bound, just fill it with white
2529         if (!texture)
2530         {
2531                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2532                 return;
2533         }
2534         mip = triangle->mip[texunitindex];
2535         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2536         // if this mipmap of the texture is 1 pixel, just fill it with that color
2537         if (texture->mipmap[mip][1] == 4)
2538         {
2539                 unsigned int k = *((const unsigned int *)pixelbase);
2540                 for (x = startx;x < endx;x++)
2541                         outi[x] = k;
2542                 return;
2543         }
2544         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2545         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2546         flags = texture->flags;
2547         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2548         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2549         tcscale = _mm_cvtepi32_ps(tcsize);
2550         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2551         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2552         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2553         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2554         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2555         tcmax = _mm_packs_epi32(tcmask, tcmask);
2556         for (x = startx;x < endx;)
2557         {
2558                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2559                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2560                 if (nextsub >= endx)
2561                 {
2562                         nextsub = endsub = endx-1;
2563                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2564                 }       
2565                 tc = endtc;
2566                 subtc = endsubtc;
2567                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2568                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2569                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2570                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2571                 substep = _mm_slli_epi32(substep, 1);
2572                 if (filter)
2573                 {
2574                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2575                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2576                         {
2577                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2578                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2579                                 {
2580                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2581                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2582                                         tci = _mm_madd_epi16(tci, tcoffset);
2583                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2584                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2585                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2586                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2587                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2588                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2589                                         fracm = _mm_srli_epi16(subtc, 1);
2590                                         pix1 = _mm_add_epi16(pix1,
2591                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2592                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2593                                         pix3 = _mm_add_epi16(pix3,
2594                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2595                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2596                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2597                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2598                                         pix2 = _mm_add_epi16(pix2,
2599                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2600                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2601                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2602                                 }
2603                                 if (x <= endsub)
2604                                 {
2605                                         const unsigned char * RESTRICT ptr1;
2606                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2607                                         tci = _mm_madd_epi16(tci, tcoffset);
2608                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2609                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2610                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2611                                         fracm = _mm_srli_epi16(subtc, 1);
2612                                         pix1 = _mm_add_epi16(pix1,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2615                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2616                                         pix1 = _mm_add_epi16(pix1,
2617                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2619                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2620                                         x++;
2621                                 }
2622                         }
2623                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2624                         {
2625                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2626                                 {
2627                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2628                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2629                                         tci = _mm_madd_epi16(tci, tcoffset);
2630                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632                                                                                         _mm_setzero_si128());
2633                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635                                                                                         _mm_setzero_si128());
2636                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2637                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2638                                         tci = _mm_madd_epi16(tci, tcoffset);
2639                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2640                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2641                                                                                         _mm_setzero_si128());
2642                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2643                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2644                                                                                         _mm_setzero_si128());
2645                                         fracm = _mm_srli_epi16(subtc, 1);
2646                                         pix1 = _mm_add_epi16(pix1,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2649                                         pix3 = _mm_add_epi16(pix3,
2650                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2651                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2652                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2653                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2654                                         pix2 = _mm_add_epi16(pix2,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2656                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2657                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2658                                 }
2659                                 if (x <= endsub)
2660                                 {
2661                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2662                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2663                                         tci = _mm_madd_epi16(tci, tcoffset);
2664                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2665                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2666                                                                                         _mm_setzero_si128());
2667                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2668                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2669                                                                                         _mm_setzero_si128());
2670                                         fracm = _mm_srli_epi16(subtc, 1);
2671                                         pix1 = _mm_add_epi16(pix1,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2674                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2675                                         pix1 = _mm_add_epi16(pix1,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2678                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2679                                         x++;
2680                                 }
2681                         }
2682                         else
2683                         {
2684                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2685                                 {
2686                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2687                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2688                                         tci = _mm_madd_epi16(tci, tcoffset);
2689                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2690                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2691                                                                                         _mm_setzero_si128());
2692                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2693                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2694                                                                                         _mm_setzero_si128());
2695                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2696                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2697                                         tci = _mm_madd_epi16(tci, tcoffset);
2698                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2699                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2700                                                                                         _mm_setzero_si128());
2701                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2702                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2703                                                                                         _mm_setzero_si128());
2704                                         fracm = _mm_srli_epi16(subtc, 1);
2705                                         pix1 = _mm_add_epi16(pix1,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2708                                         pix3 = _mm_add_epi16(pix3,
2709                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2710                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2711                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2712                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2713                                         pix2 = _mm_add_epi16(pix2,
2714                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2715                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2716                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2717                                 }
2718                                 if (x <= endsub)
2719                                 {
2720                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2721                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2722                                         tci = _mm_madd_epi16(tci, tcoffset);
2723                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2724                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2725                                                                                         _mm_setzero_si128());
2726                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2727                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2728                                                                                         _mm_setzero_si128());
2729                                         fracm = _mm_srli_epi16(subtc, 1);
2730                                         pix1 = _mm_add_epi16(pix1,
2731                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2732                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2733                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2734                                         pix1 = _mm_add_epi16(pix1,
2735                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2736                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2737                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2738                                         x++;
2739                                 }
2740                         }
2741                 }
2742                 else
2743                 {
2744                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2745                         {
2746                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2747                                 {
2748                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2749                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2750                                         tci = _mm_madd_epi16(tci, tcoffset);
2751                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2752                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2753                                 }
2754                                 if (x <= endsub)
2755                                 {
2756                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2757                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2758                                         tci = _mm_madd_epi16(tci, tcoffset);
2759                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2760                                         x++;
2761                                 }
2762                         }
2763                         else
2764                         {
2765                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2766                                 {
2767                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2768                                         tci = _mm_and_si128(tci, tcmax); 
2769                                         tci = _mm_madd_epi16(tci, tcoffset);
2770                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2771                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2772                                 }
2773                                 if (x <= endsub)
2774                                 {
2775                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2776                                         tci = _mm_and_si128(tci, tcmax); 
2777                                         tci = _mm_madd_epi16(tci, tcoffset);
2778                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2779                                         x++;
2780                                 }
2781                         }
2782                 }
2783         }
2784 #endif
2785 }
2786
2787 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2788 {
2789         // TODO: IMPLEMENT
2790         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2791 }
2792
// Shadow map lookup stub: the input vector is ignored and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2798
2799 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2800 {
2801         int x;
2802         int startx = span->startx;
2803         int endx = span->endx;
2804         float c[4];
2805         float data[4];
2806         float slope[4];
2807         float z;
2808         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2809         for (x = startx;x < endx;x++)
2810         {
2811                 z = zf[x];
2812                 c[0] = (data[0] + slope[0]*x) * z;
2813                 c[1] = (data[1] + slope[1]*x) * z;
2814                 c[2] = (data[2] + slope[2]*x) * z;
2815                 c[3] = (data[3] + slope[3]*x) * z;
2816                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2817                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2818                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2819                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2820         }
2821 }
2822
2823 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2824 {
2825         int x;
2826         int startx = span->startx;
2827         int endx = span->endx;
2828         float c[4];
2829         float data[4];
2830         float slope[4];
2831         float z;
2832         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2833         for (x = startx;x < endx;x++)
2834         {
2835                 z = zf[x];
2836                 c[0] = (data[0] + slope[0]*x) * z;
2837                 c[1] = (data[1] + slope[1]*x) * z;
2838                 c[2] = (data[2] + slope[2]*x) * z;
2839                 c[3] = (data[3] + slope[3]*x) * z;
2840                 out4f[x*4+0] = c[0];
2841                 out4f[x*4+1] = c[1];
2842                 out4f[x*4+2] = c[2];
2843                 out4f[x*4+3] = c[3];
2844         }
2845 }
2846
2847 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2848 {
2849         int x, startx = span->startx, endx = span->endx;
2850         float c[4], localcolor[4];
2851         localcolor[0] = subcolor[0];
2852         localcolor[1] = subcolor[1];
2853         localcolor[2] = subcolor[2];
2854         localcolor[3] = subcolor[3];
2855         for (x = startx;x < endx;x++)
2856         {
2857                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2858                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2859                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2860                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2861                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2862                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2863                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2864                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         for (x = startx;x < endx;x++)
2872         {
2873                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2874                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2875                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2876                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2877         }
2878 }
2879
2880 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2881 {
2882         int x, startx = span->startx, endx = span->endx;
2883         for (x = startx;x < endx;x++)
2884         {
2885                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2886                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2887                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2888                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2889         }
2890 }
2891
2892 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2893 {
2894         int x, startx = span->startx, endx = span->endx;
2895         float a, b;
2896         for (x = startx;x < endx;x++)
2897         {
2898                 a = 1.0f - inb4f[x*4+3];
2899                 b = inb4f[x*4+3];
2900                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2901                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2902                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2903                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2904         }
2905 }
2906
2907 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2908 {
2909         int x, startx = span->startx, endx = span->endx;
2910         float localcolor[4], ilerp, lerp;
2911         localcolor[0] = color[0];
2912         localcolor[1] = color[1];
2913         localcolor[2] = color[2];
2914         localcolor[3] = color[3];
2915         ilerp = 1.0f - localcolor[3];
2916         lerp = localcolor[3];
2917         for (x = startx;x < endx;x++)
2918         {
2919                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2920                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2921                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2922                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2923         }
2924 }
2925
2926
2927
2928 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2929 {
2930 #ifdef SSE_POSSIBLE
2931         int x;
2932         int startx = span->startx;
2933         int endx = span->endx;
2934         __m128 data, slope;
2935         __m128 mod, endmod;
2936         __m128i submod, substep, endsubmod;
2937         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2938         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2939         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2940         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2941         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2942         for (x = startx; x < endx;)
2943         {
2944                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2945                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2946                 if (nextsub >= endx)
2947                 {
2948                         nextsub = endsub = endx-1;
2949                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2950                 }
2951                 mod = endmod;
2952                 submod = endsubmod;
2953                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2954                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2955                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2956                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2957                 substep = _mm_packs_epi32(substep, substep);
2958                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2959                 {
2960                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2961                         pix = _mm_mulhi_epu16(pix, submod);
2962                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2963                 }
2964                 if (x <= endsub)
2965                 {
2966                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2967                         pix = _mm_mulhi_epu16(pix, submod);
2968                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2969                         x++;
2970                 }
2971         }
2972 #endif
2973 }
2974
2975 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2976 {
2977 #ifdef SSE_POSSIBLE
2978         int x;
2979         int startx = span->startx;
2980         int endx = span->endx;
2981         __m128 data, slope;
2982         __m128 mod, endmod;
2983         __m128i submod, substep, endsubmod;
2984         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2985         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2986         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2987         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2988         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2989         for (x = startx; x < endx;)
2990         {
2991                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2992                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2993                 if (nextsub >= endx)
2994                 {
2995                         nextsub = endsub = endx-1;
2996                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2997                 }
2998                 mod = endmod;
2999                 submod = endsubmod;
3000                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3001                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3002                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3003                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3004                 substep = _mm_packs_epi32(substep, substep);
3005                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3006                 {
3007                         __m128i pix = _mm_srai_epi16(submod, 4);
3008                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3009                 }
3010                 if (x <= endsub)
3011                 {
3012                         __m128i pix = _mm_srai_epi16(submod, 4);
3013                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3014                         x++;
3015                 }
3016         }
3017 #endif
3018 }
3019
3020 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3021 {
3022 #ifdef SSE_POSSIBLE
3023         int x, startx = span->startx, endx = span->endx;
3024         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3025         localcolor = _mm_packs_epi32(localcolor, localcolor);
3026         for (x = startx;x+2 <= endx;x+=2)
3027         {
3028                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3029                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3030                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3031                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3032         }
3033         if (x < endx)
3034         {
3035                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3036                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3037                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3038                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3039         }
3040 #endif
3041 }
3042
3043 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3044 {
3045 #ifdef SSE_POSSIBLE
3046         int x, startx = span->startx, endx = span->endx;
3047         for (x = startx;x+2 <= endx;x+=2)
3048         {
3049                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3050                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3051                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3052                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3053         }
3054         if (x < endx)
3055         {
3056                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3057                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3058                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3059                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3060         }
3061 #endif
3062 }
3063
3064 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3065 {
3066 #ifdef SSE_POSSIBLE
3067         int x, startx = span->startx, endx = span->endx;
3068         for (x = startx;x+2 <= endx;x+=2)
3069         {
3070                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3071                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3072                 pix1 = _mm_add_epi16(pix1, pix2);
3073                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3074         }
3075         if (x < endx)
3076         {
3077                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3078                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3079                 pix1 = _mm_add_epi16(pix1, pix2);
3080                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3081         }
3082 #endif
3083 }
3084
3085 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3086 {
3087 #ifdef SSE_POSSIBLE
3088         int x, startx = span->startx, endx = span->endx;
3089         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3090         tint = _mm_packs_epi32(tint, tint);
3091         for (x = startx;x+2 <= endx;x+=2)
3092         {
3093                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3094                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3095                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3096                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3097         }
3098         if (x < endx)
3099         {
3100                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3101                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3102                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3103                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3104         }
3105 #endif
3106 }
3107
3108 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3109 {
3110 #ifdef SSE_POSSIBLE
3111         int x, startx = span->startx, endx = span->endx;
3112         for (x = startx;x+2 <= endx;x+=2)
3113         {
3114                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3116                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3117                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3118                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3119         }
3120         if (x < endx)
3121         {
3122                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3123                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3124                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3125                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3126                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3127         }
3128 #endif
3129 }
3130
3131 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3132 {
3133 #ifdef SSE_POSSIBLE
3134         int x, startx = span->startx, endx = span->endx;
3135         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3136         localcolor = _mm_packs_epi32(localcolor, localcolor);
3137         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3138         for (x = startx;x+2 <= endx;x+=2)
3139         {
3140                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3141                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3142                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3143         }
3144         if (x < endx)
3145         {
3146                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3147                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3148                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3149         }
3150 #endif
3151 }
3152
3153
3154
3155 void DPSOFTRAST_VertexShader_Generic(void)
3156 {
3157         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3158         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3159         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3160         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3161                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3162 }
3163
3164 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3165 {
3166         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3167         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3168         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3169         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3171         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3172         {
3173                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3174                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3175                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3176                 {
3177                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3178                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3179                         {
3180                                 // multiply
3181                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3182                         }
3183                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3184                         {
3185                                 // add
3186                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3187                         }
3188                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3189                         {
3190                                 // alphablend
3191                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3192                         }
3193                 }
3194         }
3195         else
3196                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3197         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3198 }
3199
3200
3201
3202 void DPSOFTRAST_VertexShader_PostProcess(void)
3203 {
3204         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3205         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3206         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3207 }
3208
3209 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3210 {
3211         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3212         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3213         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3214         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3215         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3216         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3217         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3218         {
3219                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3220                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3221         }
3222         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3223         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3224         {
3225                 // TODO: implement saturation
3226         }
3227         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3228         {
3229                 // TODO: implement gammaramps
3230         }
3231         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3232 }
3233
3234
3235
3236 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3237 {
3238         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3239 }
3240
3241 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3242 {
3243         // this is never called (because colormask is off when this shader is used)
3244         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3245         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3246         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3247         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3248         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3249 }
3250
3251
3252
3253 void DPSOFTRAST_VertexShader_FlatColor(void)
3254 {
3255         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3256         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3257 }
3258
3259 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3260 {
3261 #ifdef SSE_POSSIBLE
3262         unsigned char * RESTRICT pixelmask = span->pixelmask;
3263         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3264         int x, startx = span->startx, endx = span->endx;
3265         __m128i Color_Ambientm;
3266         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3267         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3268         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3269         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3270         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3271         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3272                 pixel = buffer_FragColorbgra8;
3273         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3274         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3275         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3276         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3277         for (x = startx;x < endx;x++)
3278         {
3279                 __m128i color, pix;
3280                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3281                 {
3282                         __m128i pix2;
3283                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3284                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3285                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3286                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3287                         x += 3;
3288                         continue;
3289                 }
3290                 if (!pixelmask[x])
3291                         continue;
3292                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3293                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3294                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3295         }
3296         if (pixel == buffer_FragColorbgra8)
3297                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3298 #endif
3299 }
3300
3301
3302
3303 void DPSOFTRAST_VertexShader_VertexColor(void)
3304 {
3305         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3306         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3307         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3308 }
3309
3310 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3311 {
3312 #ifdef SSE_POSSIBLE
3313         unsigned char * RESTRICT pixelmask = span->pixelmask;
3314         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3315         int x, startx = span->startx, endx = span->endx;
3316         __m128i Color_Ambientm, Color_Diffusem;
3317         __m128 data, slope;
3318         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3319         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3320         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3321         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3322         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3323         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3324         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3325                 pixel = buffer_FragColorbgra8;
3326         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3327         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3328         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3329         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3330         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3331         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3332         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3333         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3334         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3335         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3336         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3337         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3338         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3339         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3340         {
3341                 __m128i color, mod, pix;
3342                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3343                 {
3344                         __m128i pix2, mod2;
3345                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3346                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3347                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3348                         data = _mm_add_ps(data, slope);
3349                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3350                         data = _mm_add_ps(data, slope);
3351                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3352                         data = _mm_add_ps(data, slope);
3353                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3354                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3355                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3356                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3357                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3358                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3359                         x += 3;
3360                         continue;
3361                 }
3362                 if (!pixelmask[x])
3363                         continue;
3364                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3365                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3366                 mod = _mm_packs_epi32(mod, mod);
3367                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3368                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3369         }
3370         if (pixel == buffer_FragColorbgra8)
3371                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3372 #endif
3373 }
3374
3375
3376
3377 void DPSOFTRAST_VertexShader_Lightmap(void)
3378 {
3379         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3380         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3381         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3382 }
3383
3384 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3385 {
3386 #ifdef SSE_POSSIBLE
3387         unsigned char * RESTRICT pixelmask = span->pixelmask;
3388         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3389         int x, startx = span->startx, endx = span->endx;
3390         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3391         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3392         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3393         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3394         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3395         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3396         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3397         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3398         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3399         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3400                 pixel = buffer_FragColorbgra8;
3401         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3404         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3405         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3406         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3407         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3408         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3409         {
3410                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3411                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3412                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3413                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3414                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3415                 for (x = startx;x < endx;x++)
3416                 {
3417                         __m128i color, lightmap, glow, pix;
3418                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3419                         {
3420                                 __m128i pix2;
3421                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3422                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3423                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3424                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3425                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3426                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3427                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3428                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3429                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3430                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3431                                 x += 3;
3432                                 continue;
3433                         }
3434                         if (!pixelmask[x])
3435                                 continue;
3436                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3437                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3438                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3439                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3440                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3441                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3442                 }
3443         }
3444         else
3445         {
3446                 for (x = startx;x < endx;x++)
3447                 {
3448                         __m128i color, lightmap, pix;
3449                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3450                         {
3451                                 __m128i pix2;
3452                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3453                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3454                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3455                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3456                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3457                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3458                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3459                                 x += 3;
3460                                 continue;
3461                         }
3462                         if (!pixelmask[x]) 
3463                                 continue;
3464                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3465                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3466                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3467                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3468                 }
3469         }
3470         if (pixel == buffer_FragColorbgra8)
3471                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3472 #endif
3473 }
3474
3475
3476 void DPSOFTRAST_VertexShader_LightDirection(void);
3477 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3478
// FakeLight vertex stage.
// Does no work of its own: the shared LightDirection vertex shader is run as-is.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3483
3484 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3485 {
3486         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3487 }
3488
3489
3490
3491 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3492 {
3493         DPSOFTRAST_VertexShader_LightDirection();
3494         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3495 }
3496
3497 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3498 {
3499         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3500 }
3501
3502
3503
3504 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3505 {
3506         DPSOFTRAST_VertexShader_LightDirection();
3507         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3508 }
3509
3510 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3511 {
3512         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3513 }
3514
3515
3516
3517 void DPSOFTRAST_VertexShader_LightDirection(void)
3518 {
3519         int i;
3520         int numvertices = dpsoftrast.numvertices;
3521         float LightDir[4];
3522         float LightVector[4];
3523         float EyePosition[4];
3524         float EyeVectorModelSpace[4];
3525         float EyeVector[4];
3526         float position[4];
3527         float svector[4];
3528         float tvector[4];
3529         float normal[4];
3530         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3531         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3532         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3533         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3534         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3535         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3536         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3537         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3538         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3539         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3540         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3541         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3542         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3543         for (i = 0;i < numvertices;i++)
3544         {
3545                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3546                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3547                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3548                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3549                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3550                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3551                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3552                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3553                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3554                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3555                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3556                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3557                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3558                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3559                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3560                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3561                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3562                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3563                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3564                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3565                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3566                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3567                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3568                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3569                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3570                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3571                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3572                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3573                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3574         }
3575         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3576 }
3577
3578 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3579 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3580 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3581 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3582 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3583 #define DPSOFTRAST_Vector3Normalize(v)\
3584 do\
3585 {\
3586         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3587         if (len)\
3588         {\
3589                 len = 1.0f / len;\
3590                 v[0] *= len;\
3591                 v[1] *= len;\
3592                 v[2] *= len;\
3593         }\
3594 }\
3595 while(0)
3596
3597 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3598 {
3599         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3600         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3601         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3602         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3603         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3604         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3605         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3606         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3607         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3608         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3609         int x, startx = span->startx, endx = span->endx;
3610         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3611         float LightVectordata[4];
3612         float LightVectorslope[4];
3613         float EyeVectordata[4];
3614         float EyeVectorslope[4];
3615         float VectorSdata[4];
3616         float VectorSslope[4];
3617         float VectorTdata[4];
3618         float VectorTslope[4];
3619         float VectorRdata[4];
3620         float VectorRslope[4];
3621         float z;
3622         float diffusetex[4];
3623         float glosstex[4];
3624         float surfacenormal[4];
3625         float lightnormal[4];
3626         float lightnormal_modelspace[4];
3627         float eyenormal[4];
3628         float specularnormal[4];
3629         float diffuse;
3630         float specular;
3631         float SpecularPower;
3632         int d[4];
3633         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3634         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3635         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3636         Color_Glow[3] = 0.0f;
3637         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3638         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3639         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3640         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3641         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3642         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3643         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3644         Color_Pants[3] = 0.0f;
3645         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3646         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3647         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3648         Color_Shirt[3] = 0.0f;
3649         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3650         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3652         {
3653                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3654                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3655         }
3656         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3657         {
3658                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3659         }
3660         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3661         {
3662                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3663                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3664                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3665                 Color_Diffuse[3] = 0.0f;
3666                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3667                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3668                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3669                 LightColor[3] = 0.0f;
3670                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3671                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3672                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3673                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3674                 Color_Specular[3] = 0.0f;
3675                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3676                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3677                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3678
3679                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3680                 {
3681                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3682                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3683                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3684                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3685                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3686                 }
3687                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3688                 {
3689                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3690                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3691                 }
3692                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3693                 {
3694                         // nothing of this needed
3695                 }
3696                 else
3697                 {
3698                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3699                 }
3700
3701                 for (x = startx;x < endx;x++)
3702                 {
3703                         z = buffer_z[x];
3704                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3705                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3706                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3707                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3708                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3709                         {
3710                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3711                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3712                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3713                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3714                         }
3715                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3716                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3717                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3718                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3719                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3720                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3721                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3722                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3723
3724                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3725                         {
3726                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3727                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3728                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3729                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3730
3731                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3732                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3733                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3734                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3735
3736                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3737                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3738                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3739                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3740
3741                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3742                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3743                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3744                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3745
3746                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3747                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3748
3749                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3750                                 {
3751                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3752                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3753                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3754                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3755                                 }
3756                         }
3757                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3758                         {
3759                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3760                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3761                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3762                                 {
3763                                         float f = 1.0f / 256.0f;
3764                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3765                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3766                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3767                                 }
3768                         }
3769                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3770                         {
3771                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3772                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3773                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3774                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3775
3776                                 LightColor[0] = 1.0;
3777                                 LightColor[1] = 1.0;
3778                                 LightColor[2] = 1.0;
3779                         }
3780                         else
3781                         {
3782                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3783                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3784                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3785                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3786                         }
3787
3788                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3789
3790                         if(thread->shader_exactspecularmath)
3791                         {
3792                                 // reflect lightnormal at surfacenormal, take the negative of that
3793                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3794                                 float f;
3795                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3796                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3797                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3798                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3799
3800                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3801                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3802                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3803                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3804                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3805
3806                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3807                         }
3808                         else
3809                         {
3810                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3811                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3812                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3813                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3814
3815                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3816                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3817                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3818                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3819
3820                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3821                         }
3822
3823                         specular = pow(specular, SpecularPower * glosstex[3]);
3824                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3825                         {
3826                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3827                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3828                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3829                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3830                         }
3831                         else
3832                         {
3833                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3834                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3835                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3836                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3837                         }
3838
3839                         buffer_FragColorbgra8[x*4+0] = d[0];
3840                         buffer_FragColorbgra8[x*4+1] = d[1];
3841                         buffer_FragColorbgra8[x*4+2] = d[2];
3842                         buffer_FragColorbgra8[x*4+3] = d[3];
3843                 }
3844         }
3845         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3846         {
3847                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3848                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3849                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3850                 Color_Diffuse[3] = 0.0f;
3851                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3852                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3853                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3854                 LightColor[3] = 0.0f;
3855                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3856
3857                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3858                 {
3859                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3860                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3861                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3862                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3863                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3864                 }
3865                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3866                 {
3867                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3868                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3869                 }
3870                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3871                 {
3872                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3873                 }
3874                 else
3875                 {
3876                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3877                 }
3878
3879                 for (x = startx;x < endx;x++)
3880                 {
3881                         z = buffer_z[x];
3882                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3883                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3884                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3885                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3886                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3887                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3888                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3889                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3890
3891                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3892                         {
3893                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3894                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3895                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3896                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3897
3898                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3899                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3900                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3901                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3902
3903                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3904                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3905                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3906                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3907
3908                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3909                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3910                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3911                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3912
3913                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3914                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3915
3916                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3917                                 {
3918                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3919                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3920                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3921                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3922                                 }
3923                         }
3924                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3925                         {
3926                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3927                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3928                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3929                                 {
3930                                         float f = 1.0f / 256.0f;
3931                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3932                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3933                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3934                                 }
3935                         }
3936                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3937                         {
3938                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3939                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3940                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3941                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3942
3943                                 LightColor[0] = 1.0;
3944                                 LightColor[1] = 1.0;
3945                                 LightColor[2] = 1.0;
3946                         }
3947                         else
3948                         {
3949                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3950                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3951                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3952                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3953                         }
3954
3955                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3956                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3957                         {
3958                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3959                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3960                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3961                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3962                         }
3963                         else
3964                         {
3965                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3966                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3967                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3968                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3969                         }
3970                         buffer_FragColorbgra8[x*4+0] = d[0];
3971                         buffer_FragColorbgra8[x*4+1] = d[1];
3972                         buffer_FragColorbgra8[x*4+2] = d[2];
3973                         buffer_FragColorbgra8[x*4+3] = d[3];
3974                 }
3975         }
3976         else
3977         {
3978                 for (x = startx;x < endx;x++)
3979                 {
3980                         z = buffer_z[x];
3981                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3982                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3983                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3984                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3985
3986                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3987                         {
3988                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3989                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3990                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3991                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3992                         }
3993                         else
3994                         {
3995                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3996                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3997                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3998                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3999                         }
4000                         buffer_FragColorbgra8[x*4+0] = d[0];
4001                         buffer_FragColorbgra8[x*4+1] = d[1];
4002                         buffer_FragColorbgra8[x*4+2] = d[2];
4003                         buffer_FragColorbgra8[x*4+3] = d[3];
4004                 }
4005         }
4006         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4007 }
4008
4009
4010
4011 void DPSOFTRAST_VertexShader_LightSource(void)
4012 {
4013         int i;
4014         int numvertices = dpsoftrast.numvertices;
4015         float LightPosition[4];
4016         float LightVector[4];
4017         float LightVectorModelSpace[4];
4018         float EyePosition[4];
4019         float EyeVectorModelSpace[4];
4020         float EyeVector[4];
4021         float position[4];
4022         float svector[4];
4023         float tvector[4];
4024         float normal[4];
4025         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4026         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4027         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4028         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4029         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4030         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4031         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4032         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4033         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4034         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4035         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4036         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4037         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4038         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4039         for (i = 0;i < numvertices;i++)
4040         {
4041                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4042                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4043                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4044                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4045                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4046                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4047                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4048                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4049                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4050                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4051                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4052                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4053                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4054                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4055                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4056                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4057                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4058                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4059                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4060                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4061                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4062                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4063                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4064                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4065                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4066                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4067                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4068                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4069                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4070                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4071                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4072                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4073         }
4074         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4075         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4076 }
4077
4078 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4079 {
4080 #ifdef SSE_POSSIBLE
4081         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4082         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4083         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4084         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4085         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4086         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4087         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4088         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4089         int x, startx = span->startx, endx = span->endx;
4090         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4091         float CubeVectordata[4];
4092         float CubeVectorslope[4];
4093         float LightVectordata[4];
4094         float LightVectorslope[4];
4095         float EyeVectordata[4];
4096         float EyeVectorslope[4];
4097         float z;
4098         float diffusetex[4];
4099         float glosstex[4];
4100         float surfacenormal[4];
4101         float lightnormal[4];
4102         float eyenormal[4];
4103         float specularnormal[4];
4104         float diffuse;
4105         float specular;
4106         float SpecularPower;
4107         float CubeVector[4];
4108         float attenuation;
4109         int d[4];
4110         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4111         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4112         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4113         Color_Glow[3] = 0.0f;
4114         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4115         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4116         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4117         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4118         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4119         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4120         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4121         Color_Diffuse[3] = 0.0f;
4122         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4123         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4124         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4125         Color_Specular[3] = 0.0f;
4126         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4127         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4128         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4129         Color_Pants[3] = 0.0f;
4130         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4131         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4132         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4133         Color_Shirt[3] = 0.0f;
4134         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4135         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4136         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4137         LightColor[3] = 0.0f;
4138         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4139         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4140         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4141         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4142         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4143         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4144         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4145         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4146         {
4147                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4148                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4149         }
4150         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4151                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4152         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4153         {
4154                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4155                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4156                 for (x = startx;x < endx;x++)
4157                 {
4158                         z = buffer_z[x];
4159                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4160                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4161                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4162                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4163                         if (attenuation < 0.01f)
4164                                 continue;
4165                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4166                         {
4167                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4168                                 if (attenuation < 0.01f)
4169                                         continue;
4170                         }
4171
4172                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4173                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4174                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4175                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4176                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4177                         {
4178                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4179                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4180                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4181                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4182                         }
4183                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4184                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4185                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4186                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4187                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4188                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4189                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4190                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4191
4192                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4193                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4194                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4195                         DPSOFTRAST_Vector3Normalize(lightnormal);
4196
4197                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4198
4199                         if(thread->shader_exactspecularmath)
4200                         {
4201                                 // reflect lightnormal at surfacenormal, take the negative of that
4202                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4203                                 float f;
4204                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4205                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4206                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4207                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4208
4209                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4210                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4211                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4212                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4213                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4214
4215                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4216                         }
4217                         else
4218                         {
4219                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4220                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4221                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4222                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4223
4224                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4225                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4226                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4227                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4228
4229                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4230                         }
4231                         specular = pow(specular, SpecularPower * glosstex[3]);
4232
4233                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4234                         {
4235                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4236                                 attenuation *= (1.0f / 255.0f);
4237                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4238                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4239                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4240                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4241                         }
4242                         else
4243                         {
4244                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4245                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4246                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4247                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4248                         }
4249                         buffer_FragColorbgra8[x*4+0] = d[0];
4250                         buffer_FragColorbgra8[x*4+1] = d[1];
4251                         buffer_FragColorbgra8[x*4+2] = d[2];
4252                         buffer_FragColorbgra8[x*4+3] = d[3];
4253                 }
4254         }
4255         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4256         {
4257                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4258                 for (x = startx;x < endx;x++)
4259                 {
4260                         z = buffer_z[x];
4261                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4262                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4263                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4264                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4265                         if (attenuation < 0.01f)
4266                                 continue;
4267                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4268                         {
4269                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4270                                 if (attenuation < 0.01f)
4271                                         continue;
4272                         }
4273
4274                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4275                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4276                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4277                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4278                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4279                         {
4280                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4281                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4282                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4283                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4284                         }
4285                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4286                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4287                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4288                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4289
4290                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4291                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4292                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4293                         DPSOFTRAST_Vector3Normalize(lightnormal);
4294
4295                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4296                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4297                         {
4298                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4299                                 attenuation *= (1.0f / 255.0f);
4300                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4301                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4302                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4303                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4304                         }
4305                         else
4306                         {
4307                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4308                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4309                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4310                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4311                         }
4312                         buffer_FragColorbgra8[x*4+0] = d[0];
4313                         buffer_FragColorbgra8[x*4+1] = d[1];
4314                         buffer_FragColorbgra8[x*4+2] = d[2];
4315                         buffer_FragColorbgra8[x*4+3] = d[3];
4316                 }
4317         }
4318         else
4319         {
4320                 for (x = startx;x < endx;x++)
4321                 {
4322                         z = buffer_z[x];
4323                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4324                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4325                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4326                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4327                         if (attenuation < 0.01f)
4328                                 continue;
4329                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4330                         {
4331                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4332                                 if (attenuation < 0.01f)
4333                                         continue;
4334                         }
4335
4336                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4337                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4338                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4339                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4340                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4341                         {
4342                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4343                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4344                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4345                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4346                         }
4347                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4348                         {
4349                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4350                                 attenuation *= (1.0f / 255.0f);
4351                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4352                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4353                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4354                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4355                         }
4356                         else
4357                         {
4358                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4359                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4360                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4361                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4362                         }
4363                         buffer_FragColorbgra8[x*4+0] = d[0];
4364                         buffer_FragColorbgra8[x*4+1] = d[1];
4365                         buffer_FragColorbgra8[x*4+2] = d[2];
4366                         buffer_FragColorbgra8[x*4+3] = d[3];
4367                 }
4368         }
4369         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4370 #endif
4371 }
4372
4373
4374
4375 void DPSOFTRAST_VertexShader_Refraction(void)
4376 {
4377         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4378         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4379         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4380 }
4381
4382 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4383 {
4384         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4385
4386         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4387         float z;
4388         int x, startx = span->startx, endx = span->endx;
4389
4390         // texture reads
4391         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4392         //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4393         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4394
4395         // varyings
4396         float ModelViewProjectionPositiondata[4];
4397         float ModelViewProjectionPositionslope[4];
4398
4399         // uniforms
4400         float ScreenScaleRefractReflect[2];
4401         float ScreenCenterRefractReflect[2];
4402         float DistortScaleRefractReflect[2];
4403         float RefractColor[4];
4404
4405         const unsigned char * RESTRICT pixelbase;
4406         const unsigned char * RESTRICT pixel[4];
4407         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4408         if(!texture) return;
4409         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4410
4411         // read textures
4412         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4413         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4414         //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4415
4416         // read varyings
4417         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4418
4419         // read uniforms
4420         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4421         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4422         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4423         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4424         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4425         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4426         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4427         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4428         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4429         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4430
4431         // do stuff
4432         for (x = startx;x < endx;x++)
4433         {
4434                 float SafeScreenTexCoord[2];
4435                 float ScreenTexCoord[2];
4436                 float v[3];
4437                 float iw;
4438                 unsigned char c[4];
4439
4440                 z = buffer_z[x];
4441
4442                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4443                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4444         
4445                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4446                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4447                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4448
4449                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4450                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4451                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4452                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4453                 DPSOFTRAST_Vector3Normalize(v);
4454                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4455                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4456
4457                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4458                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4459                 {
4460                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
4461                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4462                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4463                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4464                         int tci[2] = { tc[0]>>16, tc[1]>>16 };
4465                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4466                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4467                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4468                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4469                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4470                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4471                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4472                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4473                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4474                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4475                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4476                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4477                 }
4478                 else
4479                 {
4480                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4481                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4482                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4483                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4484                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4485                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4486                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4487                         c[0] = pixel[0][0];
4488                         c[1] = pixel[0][1];
4489                         c[2] = pixel[0][2];
4490                 }
4491
4492                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4493                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4494                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4495                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4496                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4497         }
4498
4499         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4500 }
4501
4502
4503
4504 void DPSOFTRAST_VertexShader_Water(void)
4505 {
4506         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4507 }
4508
4509
4510 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4511 {
4512         // TODO: IMPLEMENT
4513         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4514         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4515         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4516         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4517         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4518 }
4519
4520
4521
4522 void DPSOFTRAST_VertexShader_ShowDepth(void)
4523 {
4524         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4525 }
4526
4527 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4528 {
4529         // TODO: IMPLEMENT
4530         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4531         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4532         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4533         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4534         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4535 }
4536
4537
4538
4539 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4540 {
4541         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4542 }
4543
4544 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4545 {
4546         // TODO: IMPLEMENT
4547         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4548         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4549         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4550         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4551         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4552 }
4553
4554
4555
4556 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4557 {
4558         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4559 }
4560
4561 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4562 {
4563         // TODO: IMPLEMENT
4564         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4565         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4566         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4567         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4568         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4569 }
4570
4571
4572
4573 typedef struct DPSOFTRAST_ShaderModeInfo_s
4574 {
4575         int lodarrayindex;
4576         void (*Vertex)(void);
4577         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4578         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4579         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4580 }
4581 DPSOFTRAST_ShaderModeInfo;
4582
4583 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4584 {
4585         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4586         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4587         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4588         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4589         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4590         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4591         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4592         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4593         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4594         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4595         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4596         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4597         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4598         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4599         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4600         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4601 };
4602
4603 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4604 {
4605         int i;
4606         int x;
4607         int startx;
4608         int endx;
4609 //      unsigned int c;
4610 //      unsigned int *colorpixel;
4611         unsigned int *depthpixel;
4612         float w;
4613         float wslope;
4614         int depth;
4615         int depthslope;
4616         unsigned int d;
4617         DPSOFTRAST_State_Triangle *triangle;
4618         DPSOFTRAST_State_Span *span;
4619         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4620         for (i = 0; i < thread->numspans; i++)
4621         {
4622                 span = &thread->spans[i];
4623                 triangle = &thread->triangles[span->triangle];
4624                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4625                 {
4626                         wslope = triangle->w[0];
4627                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4628                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4629                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4630                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4631                         startx = span->startx;
4632                         endx = span->endx;
4633                         switch(thread->fb_depthfunc)
4634                         {
4635                         default:
4636                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4637                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4638                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4639                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4640                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4641                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4642                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4643                         }
4644                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4645                         //for (x = startx;x < endx;x++)
4646                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4647                         // if there is no color buffer, skip pixel shader
4648                         while (startx < endx && !pixelmask[startx])
4649                                 startx++;
4650                         while (endx > startx && !pixelmask[endx-1])
4651                                 endx--;
4652                         if (startx >= endx)
4653                                 continue; // no pixels to fill
4654                         span->pixelmask = pixelmask;
4655                         span->startx = startx;
4656                         span->endx = endx;
4657                         // run pixel shader if appropriate
4658                         // do this before running depthmask code, to allow the pixelshader
4659                         // to clear pixelmask values for alpha testing
4660                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4661                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4662                         if (thread->depthmask)
4663                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4664                                         if (pixelmask[x])
4665                                                 depthpixel[x] = d;
4666                 }
4667                 else
4668                 {
4669                         // no depth testing means we're just dealing with color...
4670                         // if there is no color buffer, skip pixel shader
4671                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4672                         {
4673                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4674                                 span->pixelmask = pixelmask;
4675                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4676                         }
4677                 }
4678         }
4679         thread->numspans = 0;
4680 }
4681
// Draw command record (opcode 22).  DEFCOMMAND itself is defined outside this
// part of the file; presumably it declares DPSOFTRAST_Command_Draw and
// DPSOFTRAST_OPCODE_Draw, both of which are used below - TODO confirm.
// Fields as used by the code in this section:
//   starty/endy   - y range touched by the draw (DPSOFTRAST_DrawTriangles
//                   bounds them to 0..fb_height)
//   refcount      - starts at dpsoftrast.numthreads, decremented once by each
//                   thread that interprets the command
//   clipped       - copied from dpsoftrast.drawclipped; when zero the
//                   interpreter skips the near-plane clipping test
//   firstvertex/numvertices/numtriangles - the draw call parameters
//   arrays        - start of the vertex data block (screen coordinates,
//                   then post-transform positions, then shader input arrays)
//   element3i/element3s - private copy of the index data, or NULL
//   datasize      - not written or read anywhere in this section
4682 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4683
// Executes one Draw command on behalf of a single rasterizer thread.
//
// For every triangle of the command this optionally clips against the near
// plane, applies face culling, clamps the screen bounds to the scissor
// rectangle and to the thread's two y bands, fills in the per-triangle
// interpolation planes and mip levels in thread->triangles[], and appends
// spans to thread->spans[].  DPSOFTRAST_Draw_ProcessSpans is called whenever
// the span or triangle buffer fills up and once more at the end.
//
// Each thread decrements command->refcount exactly once; the thread that
// brings it to zero frees command->arrays when the vertex data was not stored
// inline in the command (see DPSOFTRAST_Draw_AllocateDrawCommand).
//
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise the
// function does nothing.
4684 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4685 {
4686 #ifdef SSE_POSSIBLE
4687         int cullface = thread->cullface;
4688         int minx, maxx, miny, maxy;
4689         int miny1, maxy1, miny2, maxy2;
4690         __m128i fbmin, fbmax;
4691         __m128 viewportcenter, viewportscale;
4692         int firstvertex = command->firstvertex;
4693         int numvertices = command->numvertices;
4694         int numtriangles = command->numtriangles;
4695         const int *element3i = command->element3i;
4696         const unsigned short *element3s = command->element3s;
4697         int clipped = command->clipped;
4698         int i;
4699         int j;
4700         int k;
4701         int y;
4702         int e[3];
4703         __m128i screeny;
4704         int starty, endy, bandy;
4705         int numpoints;
4706         int clipcase;
4707         float clipdist[4];
4708         __m128 triangleedge1, triangleedge2, trianglenormal;
4709         __m128 clipfrac[3];
4710         __m128 screen[4];
4711         DPSOFTRAST_State_Triangle *triangle;
4712         DPSOFTRAST_Texture *texture;
4713         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp this thread's two y bands (miny1..maxy1 and miny2..maxy2)
             // to the vertical extent of the scissor rectangle
4714         miny = thread->fb_scissor[1];
4715         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4716         miny1 = bound(miny, thread->miny1, maxy);
4717         maxy1 = bound(miny, thread->maxy1, maxy);
4718         miny2 = bound(miny, thread->miny2, maxy);
4719         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's y range misses both bands: this thread has nothing
             // to rasterize, so it only drops its reference to the command
4720         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4721         {
4722                 if (!ATOMIC_DECREMENT(command->refcount))
4723                 {
                             // a commandsize no larger than the bare command struct matches
                             // the MM_CALLOC path of DPSOFTRAST_Draw_AllocateDrawCommand
4724                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4725                                 MM_FREE(command->arrays);
4726                 }
4727                 return;
4728         }
4729         minx = thread->fb_scissor[0];
4730         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // inclusive pixel bounds as packed 16-bit (x, y) pairs, reaching from
             // the top of band 1 to the bottom of band 2
4731         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4732         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4733         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4734         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4735         screen[3] = _mm_setzero_ps();
4736         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4737         for (i = 0;i < numtriangles;i++)
4738         {
                     // command->arrays starts with numvertices screen coordinates,
                     // followed by the post-transform positions; 'arrays' points at the
                     // positions until the attribute loop further down advances it
4739                 const float *screencoord4f = command->arrays;
4740                 const float *arrays = screencoord4f + numvertices*4;
4741
4742                 // generate the 3 edges of this triangle
4743                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // vertex indices are made relative to firstvertex; draws without an
                     // index buffer use three consecutive vertices per triangle
4744                 if (element3s)
4745                 {
4746                         e[0] = element3s[i*3+0] - firstvertex;
4747                         e[1] = element3s[i*3+1] - firstvertex;
4748                         e[2] = element3s[i*3+2] - firstvertex;
4749                 }
4750                 else if (element3i)
4751                 {
4752                         e[0] = element3i[i*3+0] - firstvertex;
4753                         e[1] = element3i[i*3+1] - firstvertex;
4754                         e[2] = element3i[i*3+2] - firstvertex;
4755                 }
4756                 else
4757                 {
4758                         e[0] = i*3+0;
4759                         e[1] = i*3+1;
4760                         e[2] = i*3+2;
4761                 }
4762
                     // SKIPBACKFACE: computes the two screen-space edge vectors meeting at
                     // screen[1] and the z component of their cross product, then skips
                     // this triangle (continue) when cullface is GL_BACK and that value is
                     // negative, or cullface is GL_FRONT and it is positive
4763 #define SKIPBACKFACE \
4764                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4765                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4766                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4767                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4768                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4769                 switch(cullface) \
4770                 { \
4771                 case GL_BACK: \
4772                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4773                                 continue; \
4774                         break; \
4775                 case GL_FRONT: \
4776                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4777                                 continue; \
4778                         break; \
4779                 }
4780
                     // CLIPPEDVERTEXLERP(k,p1,p2): screen[k] becomes the projection of the
                     // point on edge p1->p2 where the clip distance reaches zero; the
                     // interpolation fraction is remembered in clipfrac[p1] so that
                     // GENATTRIBLERP can interpolate the other arrays the same way
4781 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4782                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4783                         { \
4784                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4785                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4786                         }
                     // CLIPPEDVERTEXCOPY(k,p1): screen[k] is the already computed screen
                     // coordinate of vertex p1
4787 #define CLIPPEDVERTEXCOPY(k,p1) \
4788                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4789
                     // GENATTRIBS: loads the values of the current 'arrays' pointer that
                     // belong to screen[0], screen[1] and screen[2] for the clip case
                     // chosen below, copying unclipped vertices and interpolating clipped
                     // ones with the stored clipfrac values
4790 #define GENATTRIBCOPY(attrib, p1) \
4791                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4792 #define GENATTRIBLERP(attrib, p1, p2) \
4793                 { \
4794                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4795                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4796                 }
4797 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4798                 switch(clipcase) \
4799                 { \
4800                 default: \
4801                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4802                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4803                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4804                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4805                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4806                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4807                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4808                 }
4809
                     // draws not flagged as clipped bypass the near-plane test and jump
                     // straight to the "entirely in front" case below
4810                 if (! clipped)
4811                         goto notclipped;
4812
4813                 // calculate distance from nearplane
                     // (z + w of the post-transform position of each vertex)
4814                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4815                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4816                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have clipdist >= 0; each case builds the
                     // clipped polygon in screen[] (3 or 4 points) and records clipcase
                     // for GENATTRIBS
4817                 if (clipdist[0] >= 0.0f)
4818                 {
4819                         if (clipdist[1] >= 0.0f)
4820                         {
4821                                 if (clipdist[2] >= 0.0f)
4822                                 {
4823                                 notclipped:
4824                                         // triangle is entirely in front of nearplane
4825                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4826                                         SKIPBACKFACE;
4827                                         numpoints = 3;
4828                                         clipcase = 0;
4829                                 }
4830                                 else
4831                                 {
4832                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4833                                         SKIPBACKFACE;
4834                                         numpoints = 4;
4835                                         clipcase = 1;
4836                                 }
4837                         }
4838                         else
4839                         {
4840                                 if (clipdist[2] >= 0.0f)
4841                                 {
4842                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4843                                         SKIPBACKFACE;
4844                                         numpoints = 4;
4845                                         clipcase = 2;
4846                                 }
4847                                 else
4848                                 {
4849                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4850                                         SKIPBACKFACE;
4851                                         numpoints = 3;
4852                                         clipcase = 3;
4853                                 }
4854                         }
4855                 }
4856                 else if (clipdist[1] >= 0.0f)
4857                 {
4858                         if (clipdist[2] >= 0.0f)
4859                         {
4860                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4861                                 SKIPBACKFACE;
4862                                 numpoints = 4;
4863                                 clipcase = 4;
4864                         }
4865                         else
4866                         {
4867                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4868                                 SKIPBACKFACE;
4869                                 numpoints = 3;
4870                                 clipcase = 5;
4871                         }
4872                 }
4873                 else if (clipdist[2] >= 0.0f)
4874                 {
4875                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4876                         SKIPBACKFACE;
4877                         numpoints = 3;
4878                         clipcase = 6;
4879                 }
4880                 else continue; // triangle is entirely behind nearplane
4881
4882                 {
4883                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of each point as signed 16-bit
                             // pairs; for 3-point polygons point 2 is duplicated into the fourth
                             // slot so the min/max reductions are unaffected
4884                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4885                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4886                                         screenmin = _mm_min_epi16(screeni, screenir),
4887                                         screenmax = _mm_max_epi16(screeni, screenir);
4888                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4889                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4890                         screenmin = _mm_max_epi16(screenmin, fbmin);
4891                         screenmax = _mm_min_epi16(screenmax, fbmax);
4892                         // skip offscreen triangles
4893                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4894                                 continue;
4895                         starty = _mm_extract_epi16(screenmin, 1);
4896                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // the clamped y range lies entirely between the two bands
4897                         if (starty >= maxy1 && endy <= miny2)
4898                                 continue;
                             // keep only the y half of each (x, y) pair, sign-extended to one
                             // 32-bit lane per point
4899                         screeny = _mm_srai_epi32(screeni, 16);
4900                 }
4901
                     // next free triangle slot; numtriangles is only incremented after the
                     // spans that refer to it have been generated
4902                 triangle = &thread->triangles[thread->numtriangles];
4903
4904                 // calculate attribute plans for triangle data...
4905                 // okay, this triangle is going to produce spans, we'd better project
4906                 // the interpolants now (this is what gives perspective texturing),
4907                 // this consists of simply multiplying all arrays by the W coord
4908                 // (which is basically 1/Z), which will be undone per-pixel
4909                 // (multiplying by Z again) to get the perspective-correct array
4910                 // values
4911                 {
4912                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4913                         __m128 mipedgescale, mipdensity;
                             // edge vector components divided by the cross product z; these
                             // factors turn a pair of per-edge deltas into d/dx and d/dy
4914                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4915                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4916                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4917                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4918                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4919                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4920                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4921                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for the w component of the screen coordinates:
                             // triangle->w[0] = x slope, w[1] = y slope, w[2] = value at (0, 0)
4922                         attribedge1 = _mm_sub_ss(w0, w1);
4923                         attribedge2 = _mm_sub_ss(w2, w1);
4924                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4925                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4926                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4927                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4928                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4929                         _mm_store_ss(&triangle->w[0], attribxslope);
4930                         _mm_store_ss(&triangle->w[1], attribyslope);
4931                         _mm_store_ss(&triangle->w[2], attriborigin);
4932                         mipedgescale = _mm_setzero_ps();
                             // one plane (x slope, y slope, origin) per input array of the
                             // current shader mode, each pre-multiplied by w; the list ends at
                             // the first entry >= DPSOFTRAST_ARRAY_TOTAL
4933                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4934                         {
4935                                 __m128 attrib0, attrib1, attrib2;
4936                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4937                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4938                                         break;
                                     // step to the next array in the data block (same order as
                                     // laid out by DPSOFTRAST_Draw_AllocateDrawCommand)
4939                                 arrays += numvertices*4;
4940                                 GENATTRIBS(attrib0, attrib1, attrib2);
4941                                 attriborigin = _mm_mul_ps(attrib1, w1);
4942                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4943                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4944                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4945                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4946                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4947                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4948                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4949                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array named by lodarrayindex, remember the change of
                                     // its first two components along each edge divided by the
                                     // (approximate, rsqrt) screen-space edge length
4950                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4951                                 {
4952                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4953                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4954                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4955                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4956                                 }
4957                         }
4958
                             // choose a mip level per texture unit used by the shader mode;
                             // units whose texture filter is not above
                             // DPSOFTRAST_TEXTURE_FILTER_LINEAR keep level 0
4959                         memset(triangle->mip, 0, sizeof(triangle->mip));
4960                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4961                         {
4962                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4963                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4964                                         break;
4965                                 texture = thread->texbound[texunit];
4966                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4967                                 {
                                             // scale by two ints read from texture->mipmap[0][2..3]
                                             // (presumably the base level width and height - TODO
                                             // confirm), square, sum per edge and keep the smaller of
                                             // the two edge densities
4968                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4969                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4970                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4971                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4972                                         // this will be multiplied in the texturing routine by the texture resolution
                                             // note: y is used as scratch here; it is reassigned before
                                             // the span loops below
4973                                         y = _mm_cvtss_si32(mipdensity);
4974                                         if (y > 0)
4975                                         {
                                                     // half of log2 of the squared density, clamped to the
                                                     // last available mip level
4976                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4977                                                 if (y > texture->mipmaps - 1)
4978                                                         y = texture->mipmaps - 1;
4979                                                 triangle->mip[texunit] = y;
4980                                         }
4981                                 }
4982                         }
4983                 }
4984         
                     // outer loop: first the rows that fall into band 1 (up to maxy1),
                     // then skip ahead to miny2 and do the rows of band 2
                     // inner loop: walk the polygon in groups of rows that share the same
                     // pair of active edges
4985                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4986                 for (; y < bandy;)
4987                 {
4988                         __m128 xcoords, xslope;
                             // one 32-bit lane per point, all ones where the current row is
                             // already past that point's integer y; movemask_epi8 turns each
                             // lane into one hex digit of yccmask (point 0 = lowest digit)
4989                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4990                         int yccmask = _mm_movemask_epi8(ycc);
                             // for each of the two active edges: p = point already passed,
                             // n = point still ahead
4991                         int edge0p, edge0n, edge1p, edge1n;
4992                         int nexty;
                             // in the case comments a 1 marks a point that has not been passed
                             // yet (its hex digit in the mask is 0)
4993                         if (numpoints == 4)
4994                         {
4995                                 switch(yccmask)
4996                                 {
4997                                 default:
                                     // every point passed: nothing left of this polygon
4998                                 case 0xFFFF: /*0000*/ y = endy; continue;
4999                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5000                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5001                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5002                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5003                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5004                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5005                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5006                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5007                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5008                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5009                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5010                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5011                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5012                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
                                     // no point passed yet: no edge pair is active on this row
5013                                 case 0x0000: /*1111*/ y++; continue;
5014                                 }
5015                         }
5016                         else
5017                         {
                                     // with 3 points the fourth lane duplicates point 2, so the two
                                     // upper hex digits are always equal
5018                                 switch(yccmask)
5019                                 {
5020                                 default:
5021                                 case 0xFFFF: /*000*/ y = endy; continue;
5022                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5023                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5024                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5025                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5026                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5027                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5028                                 case 0x0000: /*111*/ y++; continue;
5029                                 }
5030                         }
                             // nexty = smallest integer y among the points not yet passed
                             // (passed lanes are turned into 0x7FFF so they lose the min),
                             // limited to the last row of the current band
5031                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5032                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5033                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5034                         nexty = _mm_extract_epi16(ycc, 0);
5035                         if (nexty >= bandy) nexty = bandy-1;
                             // per edge: x change per row (delta divided by the edge's y delta)
                             // and x at row y, offset by 0.5; lanes 0-1 hold edge 0, lanes 2-3
                             // hold edge 1
5036                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5037                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5038                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5039                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5040                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // make sure the edge with the smaller x comes first
5041                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5042                         {
5043                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5044                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5045                         }
                             // emit the rows up to and including nexty, stepping both edge x
                             // coordinates by their slopes
5046                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5047                         {
5048                                 int startx, endx, offset;
5049                                 startx = _mm_cvtss_si32(xcoords);
5050                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5051                                 if (startx < minx) 
5052                                 {
5053                                         if (startx < 0) startx = 0;
                                             // move right in whole multiples of the span length without
                                             // passing minx (assumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a
                                             // power of two - TODO confirm)
5054                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5055                                 }
5056                                 if (endx > maxx) endx = maxx;
5057                                 if (startx >= endx) continue;
                                     // cut the row into pieces of at most
                                     // DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; span->startx and
                                     // span->endx are relative to span->x
5058                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5059                                 {
5060                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5061                                         span->triangle = thread->numtriangles;
5062                                         span->x = offset;
5063                                         span->y = y;
5064                                         span->startx = max(minx - offset, 0);
5065                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                             // an empty piece is not committed; its slot is reused
5066                                         if (span->startx >= span->endx)
5067                                                 continue; 
                                             // span buffer full: rasterize what has been queued so far
5068                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5069                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5070                                 }
5071                         }
5072                 }
5073
                     // commit the triangle; when the triangle buffer is full, flush the
                     // queued spans and start over at slot 0
5074                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5075                 {
5076                         DPSOFTRAST_Draw_ProcessSpans(thread);
5077                         thread->numtriangles = 0;
5078                 }
5079         }
5080
             // this thread no longer reads the command's vertex data: drop its
             // reference, and free the separately allocated data block if this was
             // the last reference
5081         if (!ATOMIC_DECREMENT(command->refcount))
5082         {
5083                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5084                         MM_FREE(command->arrays);
5085         }
5086
             // rasterize whatever is still queued
5087         if (thread->numspans > 0 || thread->numtriangles > 0)
5088         {
5089                 DPSOFTRAST_Draw_ProcessSpans(thread);
5090                 thread->numtriangles = 0;
5091         }
5092 #endif
5093 }
5094
5095 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5096 {
5097         int i;
5098         int j;
5099         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5100         int datasize = 2*numvertices*sizeof(float[4]);
5101         DPSOFTRAST_Command_Draw *command;
5102         unsigned char *data;
5103         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5104         {
5105                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5106                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5107                         break;
5108                 datasize += numvertices*sizeof(float[4]);
5109         }
5110         if (element3s)
5111                 datasize += numtriangles*sizeof(unsigned short[3]);
5112         else if (element3i)
5113                 datasize += numtriangles*sizeof(int[3]);
5114         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5115         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5116         {
5117                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5118                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5119         }
5120         else
5121         {
5122                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5123                 data = (unsigned char *)command + commandsize;
5124         }
5125         command->firstvertex = firstvertex;
5126         command->numvertices = numvertices;
5127         command->numtriangles = numtriangles;
5128         command->arrays = (float *)data;
5129         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5130         dpsoftrast.firstvertex = firstvertex;
5131         dpsoftrast.numvertices = numvertices;
5132         dpsoftrast.screencoord4f = (float *)data;
5133         data += numvertices*sizeof(float[4]);
5134         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5135         data += numvertices*sizeof(float[4]);
5136         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5137         {
5138                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5139                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5140                         break;
5141                 dpsoftrast.post_array4f[j] = (float *)data;
5142                 data += numvertices*sizeof(float[4]);
5143         }
5144         command->element3i = NULL;
5145         command->element3s = NULL;
5146         if (element3s)
5147         {
5148                 command->element3s = (unsigned short *)data;
5149                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5150         }
5151         else if (element3i)
5152         {
5153                 command->element3i = (int *)data;
5154                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5155         }
5156         return command;
5157 }
5158
5159 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5160 {
5161         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5162         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5163         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5164         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5165         if (command->starty >= command->endy)
5166         {
5167                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5168                         MM_FREE(command->arrays);
5169                 DPSOFTRAST_UndoCommand(command->commandsize);
5170                 return;
5171         }
5172         command->clipped = dpsoftrast.drawclipped;
5173         command->refcount = dpsoftrast.numthreads;
5174
5175         if (dpsoftrast.usethreads)
5176         {
5177                 int i;
5178                 DPSOFTRAST_Draw_SyncCommands();
5179                 for (i = 0; i < dpsoftrast.numthreads; i++)
5180                 {
5181                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5182                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5183                                 Thread_CondSignal(thread->drawcond);
5184                 }
5185         }
5186         else
5187         {
5188                 DPSOFTRAST_Draw_FlushThreads();
5189         }
5190 }
5191
5192 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5193 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5194 {
5195         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5196 }
5197 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5198 {
5199         DPSOFTRAST_Command_SetRenderTargets *command;
5200         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5201                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5202                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5203                 DPSOFTRAST_Flush();
5204         dpsoftrast.fb_width = width;
5205         dpsoftrast.fb_height = height;
5206         dpsoftrast.fb_depthpixels = depthpixels;
5207         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5208         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5209         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5210         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5211         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5212         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5213         command->width = width;
5214         command->height = height;
5215 }
5216  
// Executes queued commands on behalf of one thread state.
// Starts at thread->commandoffset and walks dpsoftrast.commandpool.commands
// until the offset equals endoffset, then stores the final offset back into
// thread->commandoffset.
5217 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5218 {
5219         int commandoffset = thread->commandoffset;
5220         while (commandoffset != endoffset)
5221         {
5222                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
                // NOTE(review): the switch has no default case; an opcode that is not listed
                // below leaves commandoffset unchanged, so this loop would never terminate
5223                 switch (command->opcode)
5224                 {
                // fixed-size commands: call the matching DPSOFTRAST_Interpret_<name> handler, step
                // over the aligned size of the command struct and wrap to offset 0 once the end
                // of the pool is reached (the macro is not #undef'd after use)
5225 #define INTERPCOMMAND(name) \
5226                 case DPSOFTRAST_OPCODE_##name : \
5227                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5228                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5229                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5230                                 commandoffset = 0; \
5231                         break;
5232                 INTERPCOMMAND(Viewport)
5233                 INTERPCOMMAND(ClearColor)
5234                 INTERPCOMMAND(ClearDepth)
5235                 INTERPCOMMAND(ColorMask)
5236                 INTERPCOMMAND(DepthTest)
5237                 INTERPCOMMAND(ScissorTest)
5238                 INTERPCOMMAND(Scissor)
5239                 INTERPCOMMAND(BlendFunc)
5240                 INTERPCOMMAND(BlendSubtract)
5241                 INTERPCOMMAND(DepthMask)
5242                 INTERPCOMMAND(DepthFunc)
5243                 INTERPCOMMAND(DepthRange)
5244                 INTERPCOMMAND(PolygonOffset)
5245                 INTERPCOMMAND(CullFace)
5246                 INTERPCOMMAND(AlphaTest)
5247                 INTERPCOMMAND(AlphaFunc)
5248                 INTERPCOMMAND(SetTexture)
5249                 INTERPCOMMAND(SetShader)
5250                 INTERPCOMMAND(Uniform4f)
5251                 INTERPCOMMAND(UniformMatrix4f)
5252                 INTERPCOMMAND(Uniform1i)
5253                 INTERPCOMMAND(SetRenderTargets)
5254
                // draw commands advance by the size stored in the command itself rather than
                // by sizeof the struct, and the thread's offset is written back after every
                // draw instead of only once at the end of the loop
5255                 case DPSOFTRAST_OPCODE_Draw:
5256                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5257                         commandoffset += command->commandsize;
5258                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5259                                 commandoffset = 0;
5260                         thread->commandoffset = commandoffset;
5261                         break;
5262
                // Reset carries no work; it only sends the reader back to the start of the pool
5263                 case DPSOFTRAST_OPCODE_Reset:
5264                         commandoffset = 0;
5265                         break;
5266                 }
5267         }
5268         thread->commandoffset = commandoffset;
5269 }
5270
5271 static int DPSOFTRAST_Draw_Thread(void *data)
5272 {
5273         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5274         while(thread->index >= 0)
5275         {
5276                 if (thread->commandoffset != dpsoftrast.drawcommand)
5277                 {
5278                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5279                 }
5280                 else 
5281                 {
5282                         Thread_LockMutex(thread->drawmutex);
5283                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5284                         {
5285                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5286                                 thread->starving = true;
5287                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5288                                 thread->starving = false;
5289                         }
5290                         Thread_UnlockMutex(thread->drawmutex);
5291                 }
5292         }   
5293         return 0;
5294 }
5295
5296 static void DPSOFTRAST_Draw_FlushThreads(void)
5297 {
5298         DPSOFTRAST_State_Thread *thread;
5299         int i;
5300         DPSOFTRAST_Draw_SyncCommands();
5301         if (dpsoftrast.usethreads) 
5302         {
5303                 for (i = 0; i < dpsoftrast.numthreads; i++)
5304                 {
5305                         thread = &dpsoftrast.threads[i];
5306                         if (thread->commandoffset != dpsoftrast.drawcommand)
5307                         {
5308                                 Thread_LockMutex(thread->drawmutex);
5309                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5310                                         Thread_CondSignal(thread->drawcond);
5311                                 Thread_UnlockMutex(thread->drawmutex);
5312                         }
5313                 }
5314                 for (i = 0; i < dpsoftrast.numthreads; i++)
5315                 {
5316                         thread = &dpsoftrast.threads[i];
5317                         if (thread->commandoffset != dpsoftrast.drawcommand)
5318                         {
5319                                 Thread_LockMutex(thread->drawmutex);
5320                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5321                                 {
5322                                         thread->waiting = true;
5323                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5324                                         thread->waiting = false;
5325                                 }
5326                                 Thread_UnlockMutex(thread->drawmutex);
5327                         }
5328                 }
5329         }
5330         else
5331         {
5332                 for (i = 0; i < dpsoftrast.numthreads; i++)
5333                 {
5334                         thread = &dpsoftrast.threads[i];
5335                         if (thread->commandoffset != dpsoftrast.drawcommand)
5336                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5337                 }
5338         }
5339         dpsoftrast.commandpool.usedcommands = 0;
5340 }
5341
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads, which
// executes (or waits for the worker threads to execute) all queued commands.
5342 void DPSOFTRAST_Flush(void)
5343 {
5344         DPSOFTRAST_Draw_FlushThreads();
5345 }
5346
// Currently does exactly what DPSOFTRAST_Flush does; nothing extra happens here.
5347 void DPSOFTRAST_Finish(void)
5348 {
5349         DPSOFTRAST_Flush();
5350 }
5351
5352 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5353 {
5354         int i;
5355         union
5356         {
5357                 int i;
5358                 unsigned char b[4];
5359         }
5360         u;
5361         u.i = 1;
5362         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5363         dpsoftrast.bigendian = u.b[3];
5364         dpsoftrast.fb_width = width;
5365         dpsoftrast.fb_height = height;
5366         dpsoftrast.fb_depthpixels = depthpixels;
5367         dpsoftrast.fb_colorpixels[0] = colorpixels;
5368         dpsoftrast.fb_colorpixels[1] = NULL;
5369         dpsoftrast.fb_colorpixels[1] = NULL;
5370         dpsoftrast.fb_colorpixels[1] = NULL;
5371         dpsoftrast.viewport[0] = 0;
5372         dpsoftrast.viewport[1] = 0;
5373         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5374         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5375         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5376         dpsoftrast.texture_firstfree = 1;
5377         dpsoftrast.texture_end = 1;
5378         dpsoftrast.texture_max = 0;
5379         dpsoftrast.color[0] = 1;
5380         dpsoftrast.color[1] = 1;
5381         dpsoftrast.color[2] = 1;
5382         dpsoftrast.color[3] = 1;
5383         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5384         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5385         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5386         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5387         for (i = 0; i < dpsoftrast.numthreads; i++)
5388         {
5389                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5390                 thread->index = i;
5391                 thread->cullface = GL_BACK;
5392                 thread->colormask[1] = 1;
5393                 thread->colormask[2] = 1;
5394                 thread->colormask[3] = 1;
5395                 thread->blendfunc[0] = GL_ONE;
5396                 thread->blendfunc[1] = GL_ZERO;
5397                 thread->depthmask = true;
5398                 thread->depthtest = true;
5399                 thread->depthfunc = GL_LEQUAL;
5400                 thread->scissortest = false;
5401                 thread->alphatest = false;
5402                 thread->alphafunc = GL_GREATER;
5403                 thread->alphavalue = 0.5f;
5404                 thread->viewport[0] = 0;
5405                 thread->viewport[1] = 0;
5406                 thread->viewport[2] = dpsoftrast.fb_width;
5407                 thread->viewport[3] = dpsoftrast.fb_height;
5408                 thread->scissor[0] = 0;
5409                 thread->scissor[1] = 0;
5410                 thread->scissor[2] = dpsoftrast.fb_width;
5411                 thread->scissor[3] = dpsoftrast.fb_height;
5412                 thread->depthrange[0] = 0;
5413                 thread->depthrange[1] = 1;
5414                 thread->polygonoffset[0] = 0;
5415                 thread->polygonoffset[1] = 0;
5416         
5417                 DPSOFTRAST_RecalcThread(thread);
5418         
5419                 thread->numspans = 0;
5420                 thread->numtriangles = 0;
5421                 thread->commandoffset = 0;
5422                 thread->waiting = false;
5423                 thread->starving = false;
5424            
5425                 thread->validate = -1;
5426                 DPSOFTRAST_Validate(thread, -1);
5427  
5428                 if (dpsoftrast.usethreads)
5429                 {
5430                         thread->waitcond = Thread_CreateCond();
5431                         thread->drawcond = Thread_CreateCond();
5432                         thread->drawmutex = Thread_CreateMutex();
5433                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5434                 }
5435         }
5436         return 0;
5437 }
5438
5439 void DPSOFTRAST_Shutdown(void)
5440 {
5441         int i;
5442         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5443         {
5444                 DPSOFTRAST_State_Thread *thread;
5445                 for (i = 0; i < dpsoftrast.numthreads; i++)
5446                 {
5447                         thread = &dpsoftrast.threads[i];
5448                         Thread_LockMutex(thread->drawmutex);
5449                         thread->index = -1;
5450                         Thread_CondSignal(thread->drawcond);
5451                         Thread_UnlockMutex(thread->drawmutex);
5452                         Thread_WaitThread(thread->thread, 0);
5453                         Thread_DestroyCond(thread->waitcond);
5454                         Thread_DestroyCond(thread->drawcond);
5455                         Thread_DestroyMutex(thread->drawmutex);
5456                 }
5457         }
5458         for (i = 0;i < dpsoftrast.texture_end;i++)
5459                 if (dpsoftrast.texture[i].bytes)
5460                         MM_FREE(dpsoftrast.texture[i].bytes);
5461         if (dpsoftrast.texture)
5462                 free(dpsoftrast.texture);
5463         if (dpsoftrast.threads)
5464                 MM_FREE(dpsoftrast.threads);
5465         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5466 }
5467