]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
workaround for missing _mm_cvtss_f32 on some gcc versions
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Alignment, memory-barrier and atomic-counter macros for SSE-capable builds.
// Anything left undefined here is given a portable fallback further down.
#ifdef SSE_POSSIBLE
	#if defined(__APPLE__) || defined(__GNUC__)
		// Apple and GNU-compatible toolchains share the attribute-based
		// alignment macros and the SSE store fence; only the atomic
		// primitives differ between the two.
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		#define MEMORY_BARRIER (_mm_sfence())
		#if defined(__APPLE__)
			#include <libkern/OSAtomic.h>
			#define ATOMIC_COUNTER volatile int32_t
			#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
			#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
			#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
		#else
			// alternative barrier that was left disabled: (__sync_synchronize())
			#define ATOMIC_COUNTER volatile int
			#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
			#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
			#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
		#endif
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
		#define MEMORY_BARRIER (_mm_sfence())
		// alternative barrier that was left disabled: (MemoryBarrier())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif
50
// Portable fallbacks for every macro the platform-specific section may have
// left undefined.  These versions add no alignment, no barrier and use plain
// (non-atomic) integer arithmetic.
#if !defined(ALIGN)
	#define ALIGN(var) var
#endif

#if !defined(ATOMIC)
	#define ATOMIC(var) var
#endif

#if !defined(MEMORY_BARRIER)
	#define MEMORY_BARRIER ((void)0)
#endif

#if !defined(ATOMIC_COUNTER)
	#define ATOMIC_COUNTER int
#endif

// Both yield the updated counter value, like their atomic counterparts.
#if !defined(ATOMIC_INCREMENT)
	#define ATOMIC_INCREMENT(counter) (++(counter))
#endif

#if !defined(ATOMIC_DECREMENT)
	#define ATOMIC_DECREMENT(counter) (--(counter))
#endif

// Adds val to counter; the result is deliberately discarded.
#if !defined(ATOMIC_ADD)
	#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Allocation helpers.  SSE builds use _mm_malloc/_mm_free with ATOMIC_SIZE
// alignment; other builds fall back to the C allocator.  Memory obtained from
// MM_MALLOC / MM_CALLOC must be released with MM_FREE.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Workaround for GCC versions older than 4.6 that lack _mm_cvtss_f32.
// The major version has to take part in the comparison (a bare minor-version
// test would also match e.g. GCC 5.0), and clang is excluded because it only
// reports a GCC-compatible version number while providing the intrinsic
// in its own headers.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() equivalent on top of _mm_malloc: returns nmemb*size zeroed bytes
// aligned to ATOMIC_SIZE, or NULL when the allocation fails or the byte
// count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
95
// Index of each per-vertex attribute array; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size attribute tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
111
// One slot of the texture table (dpsoftrast.texture).  A slot whose `bytes`
// pointer is NULL is treated as unused by the lookup and allocation code.
typedef struct DPSOFTRAST_Texture_s
{
	int flags; // flags passed to DPSOFTRAST_Texture_New
	int width; // dimensions of mip level 0
	int height;
	int depth;
	int sides; // 6 when the cubemap flag is set, otherwise 1
	DPSOFTRAST_TEXTURE_FILTER filter; // NOTE(review): not assigned in the part of the file visible here
	int mipmaps; // number of entries used in mipmap[]
	int size; // total size in bytes of all mip levels
	ATOMIC_COUNTER binds; // while nonzero, texture free/update code calls DPSOFTRAST_Flush first
	unsigned char *bytes; // pixel storage for all mip levels (allocated with MM_CALLOC)
	int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: byte offset, byte size, width, height, depth
}
DPSOFTRAST_Texture;
127
// Commands use the same alignment as ALIGN()ed data.
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header common to every command: each struct generated by DEFCOMMAND starts
// with these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
	unsigned short commandsize; // presumably the size of the whole command — TODO confirm against the command allocator
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares, for a command called `name`:
//  - the enum constant DPSOFTRAST_OPCODE_<name> with the value opcodeval
//  - the COMMAND_ALIGN-ed struct type DPSOFTRAST_Command_<name>, consisting
//    of the common header (opcode, commandsize) followed by `fields`
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
148
// Size in bytes of the command buffer, and (presumably) the largest single
// command that may be stored in it — TODO confirm against the allocator.
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Fixed-size byte buffer holding queued commands plus its bookkeeping.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
	int freecommand; // NOTE(review): presumably the offset of the next free byte in commands[] — confirm
	int usedcommands; // NOTE(review): presumably the number of bytes currently in use — confirm
	ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
159
// Per-triangle interpolation data consumed by the span drawing code.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
	unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
	float w[3];
	// attribs[array][k] is a 4-float vector; as used by DPSOFTRAST_CALCATTRIB:
	// k=0 is the per-x slope, k=1 the per-y slope, k=2 the base value
	ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
167
// Evaluates one attribute array of `triangle` at the start of `span`:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: `data` and `slope` are __m128 lvalues.  `triangle` and `span`
// are pointer expressions and are evaluated more than once.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the same computation: `data` and `slope` are float[4]
// (or float pointer) expressions.  Every macro argument is parenthesized so
// that expressions such as `&spans[i]` or `buffer + 4` expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
184                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels generated from a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x; // framebuffer x coord
	int y; // framebuffer y coord
	int startx; // usable range (according to pixelmask)
	int endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
	int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);

// capacities of the per-thread span, triangle and pixel-mask buffers
// (see the arrays at the end of DPSOFTRAST_State_Thread)
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived values must be recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
208
// Blend modes the rasterizer implements; DPSOFTRAST_RecalcBlendFunc maps a
// (source factor, destination factor) pair onto one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD = 9,
	DPSOFTRAST_BLENDMODE_TOTAL = 10
}
DPSOFTRAST_BLENDMODE;
224
// State owned by one rasterizer thread: render state, the values derived
// from it (fb_* fields), the rows this thread is responsible for, and the
// span/triangle work buffers.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
	void *thread;
	int index; // position of this thread among dpsoftrast.numthreads; used to pick its rows
	
	int cullface;
	int colormask[4];
	int blendfunc[2]; // source and destination blend factors (GL_* values)
	int blendsubtract; // nonzero selects the subtractive blend mode table
	int depthmask;
	int depthtest; // when zero the effective depth function is GL_ALWAYS
	int depthfunc;
	int scissortest; // when zero the scissor rectangle covers the whole framebuffer
	int alphatest;
	int alphafunc;
	float alphavalue;
	int viewport[4];
	int scissor[4]; // x, y, width, height; y is flipped against fb_height when fb_scissor is derived
	float depthrange[2];
	float polygonoffset[2];
	float clipplane[4];
	ALIGN(float fb_clipplane[4]); // clipplane rescaled by DPSOFTRAST_RecalcClipPlane

	int shader_mode;
	int shader_permutation;
	int shader_exactspecularmath;

	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture (rebased when that array grows)
	
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	// DPSOFTRAST_VALIDATE_ flags
	int validate;

	// derived values (DPSOFTRAST_VALIDATE_FB)
	int fb_colormask;
	int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	// derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
	int fb_depthfunc;

	// derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
	int fb_blendmode;

	// band boundaries
	// (set by DPSOFTRAST_RecalcThread; the second pair differs from the
	// first only when dpsoftrast.interlace is enabled)
	int miny1;
	int maxy1;
	int miny2;
	int maxy2;

	ATOMIC(volatile int commandoffset);

	volatile bool waiting;
	volatile bool starving;
	void *waitcond;
	void *drawcond;
	void *drawmutex;

	int numspans;
	int numtriangles;
	DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
	DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
	unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
293
// Global rasterizer state: framebuffer description, the caller-facing render
// state, vertex array pointers, the texture table, the worker threads and
// the command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
	int fb_width;
	int fb_height;
	unsigned int *fb_depthpixels;
	unsigned int *fb_colorpixels[4];

	int viewport[4];
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	float color[4];
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	const float *pointer_vertex3f;
	const float *pointer_color4f;
	const unsigned char *pointer_color4ub;
	const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int stride_vertex;
	int stride_color;
	int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into `texture` (rebased when that array grows)

	int firstvertex;
	int numvertices;
	float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
	float *screencoord4f;
	int drawstarty;
	int drawendy;
	int drawclipped;
	
	int shader_mode;
	int shader_permutation;
	int shader_exactspecularmath;

	// texture table
	int texture_max; // number of slots allocated in `texture`
	int texture_end; // one past the highest slot index in use
	int texture_firstfree; // index at which the search for a free slot starts
	DPSOFTRAST_Texture *texture;

	int bigendian;

	// error reporting
	const char *errorstring;

	bool usethreads;
	int interlace; // nonzero gives each thread two row bands instead of one
	int numthreads;
	DPSOFTRAST_State_Thread *threads;

	ATOMIC(volatile int drawcommand);

	DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global instance used by every function in this file
DPSOFTRAST_State dpsoftrast;
353
// Depth buffer scale (2^30) and offset constants.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one integer laid out as
// alpha<<24 | red<<16 | green<<8 | blue, rounding each channel to nearest.
// Arguments are parenthesized so compound expressions scale as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d into the integer depth buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d), truncated towards zero.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
358
359 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
360 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
361
362 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
363 {
364         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
365         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
366         fb_viewportcenter[3] = 0.5f;
367         fb_viewportcenter[0] = 0.0f;
368         fb_viewportscale[1] = 0.5f * viewport[2];
369         fb_viewportscale[2] = -0.5f * viewport[3];
370         fb_viewportscale[3] = 0.5f;
371         fb_viewportscale[0] = 1.0f;
372 }
373
374 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
375 {
376         if (dpsoftrast.interlace)
377         {
378                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
379                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
380                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
382         }
383         else
384         {
385                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
386                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
387         }
388 }
389
390 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
391 {
392         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
393         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
394         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
395         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
396         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
397 }
398
399 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
400 {
401         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
402         // and viewport projection values
403         int x1, x2;
404         int y1, y2;
405         x1 = thread->scissor[0];
406         x2 = thread->scissor[0] + thread->scissor[2];
407         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
408         y2 = dpsoftrast.fb_height - thread->scissor[1];
409         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
410         if (x1 < 0) x1 = 0;
411         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
412         if (y1 < 0) y1 = 0;
413         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
414         thread->fb_scissor[0] = x1;
415         thread->fb_scissor[1] = y1;
416         thread->fb_scissor[2] = x2 - x1;
417         thread->fb_scissor[3] = y2 - y1;
418
419         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
420         DPSOFTRAST_RecalcClipPlane(thread);
421         DPSOFTRAST_RecalcThread(thread);
422 }
423
424 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
425 {
426         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
427 }
428
429 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
430 {
431         if (thread->blendsubtract)
432         {
433                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
434                 {
435                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
436                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
437                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
438                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
439                 }
440         }
441         else
442         {       
443                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
444                 {
445                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
446                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
447                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
448                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
449                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
450                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
451                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
452                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
453                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
454                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
455                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
456                 }
457         }
458 }
459
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate; the expression always evaluates to 0.
// `thread` and `f` are evaluated more than once, so pass expressions without
// side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
461
462 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
463 {
464         mask &= thread->validate;
465         if (!mask)
466                 return;
467         if (mask & DPSOFTRAST_VALIDATE_FB)
468         {
469                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
470                 DPSOFTRAST_RecalcFB(thread);
471         }
472         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
473         {
474                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
475                 DPSOFTRAST_RecalcDepthFunc(thread);
476         }
477         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
478         {
479                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
480                 DPSOFTRAST_RecalcBlendFunc(thread);
481         }
482 }
483
484 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
485 {
486         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
487                 return &dpsoftrast.texture[index];
488         return NULL;
489 }
490
491 static void DPSOFTRAST_Texture_Grow(void)
492 {
493         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
494         DPSOFTRAST_State_Thread *thread;
495         int i;
496         int j;
497         DPSOFTRAST_Flush();
498         // expand texture array as needed
499         if (dpsoftrast.texture_max < 1024)
500                 dpsoftrast.texture_max = 1024;
501         else
502                 dpsoftrast.texture_max *= 2;
503         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
504         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
505                 if (dpsoftrast.texbound[i])
506                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
507         for (j = 0; j < dpsoftrast.numthreads; j++)
508         {
509                 thread = &dpsoftrast.threads[j];
510                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
511                         if (thread->texbound[i])
512                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
513         }
514 }
515
516 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
517 {
518         int w;
519         int h;
520         int d;
521         int size;
522         int s;
523         int texnum;
524         int mipmaps;
525         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
526         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
527         DPSOFTRAST_Texture *texture;
528         if (width*height*depth < 1)
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
531                 return 0;
532         }
533         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
534         {
535                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
536                 return 0;
537         }
538         switch(texformat)
539         {
540         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
541         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
542         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
543                 break;
544         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
545                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
546                 {
547                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
548                         return 0;
549                 }
550                 if (depth != 1)
551                 {
552                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
553                         return 0;
554                 }
555                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
556                 {
557                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
558                         return 0;
559                 }
560                 break;
561         }
562         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
563         {
564                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
565                 return 0;
566         }
567         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
568         {
569                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
570                 return 0;
571         }
572         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
573         {
574                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
575                 return 0;
576         }
577         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578         {
579                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
580                 return 0;
581         }
582         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
583         {
584                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
585                 return 0;
586         }
587         // find first empty slot in texture array
588         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
589                 if (!dpsoftrast.texture[texnum].bytes)
590                         break;
591         dpsoftrast.texture_firstfree = texnum + 1;
592         if (dpsoftrast.texture_max <= texnum)
593                 DPSOFTRAST_Texture_Grow();
594         if (dpsoftrast.texture_end <= texnum)
595                 dpsoftrast.texture_end = texnum + 1;
596         texture = &dpsoftrast.texture[texnum];
597         memset(texture, 0, sizeof(*texture));
598         texture->flags = flags;
599         texture->width = width;
600         texture->height = height;
601         texture->depth = depth;
602         texture->sides = sides;
603         texture->binds = 0;
604         w = width;
605         h = height;
606         d = depth;
607         size = 0;
608         mipmaps = 0;
609         w = width;
610         h = height;
611         d = depth;
612         for (;;)
613         {
614                 s = w * h * d * sides * 4;
615                 texture->mipmap[mipmaps][0] = size;
616                 texture->mipmap[mipmaps][1] = s;
617                 texture->mipmap[mipmaps][2] = w;
618                 texture->mipmap[mipmaps][3] = h;
619                 texture->mipmap[mipmaps][4] = d;
620                 size += s;
621                 mipmaps++;
622                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
623                         break;
624                 if (w > 1) w >>= 1;
625                 if (h > 1) h >>= 1;
626                 if (d > 1) d >>= 1;
627         }
628         texture->mipmaps = mipmaps;
629         texture->size = size;
630
631         // allocate the pixels now
632         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
633
634         return texnum;
635 }
636 void DPSOFTRAST_Texture_Free(int index)
637 {
638         DPSOFTRAST_Texture *texture;
639         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
640         if (texture->binds)
641                 DPSOFTRAST_Flush();
642         if (texture->bytes)
643                 MM_FREE(texture->bytes);
644         texture->bytes = NULL;
645         memset(texture, 0, sizeof(*texture));
646         // adjust the free range and used range
647         if (dpsoftrast.texture_firstfree > index)
648                 dpsoftrast.texture_firstfree = index;
649         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
650                 dpsoftrast.texture_end--;
651 }
652 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
653 {
654         int i, x, y, z, w, layer0, layer1, row0, row1;
655         unsigned char *o, *i0, *i1, *i2, *i3;
656         DPSOFTRAST_Texture *texture;
657         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658         if (texture->mipmaps <= 1)
659                 return;
660         for (i = 1;i < texture->mipmaps;i++)
661         {
662                 for (z = 0;z < texture->mipmap[i][4];z++)
663                 {
664                         layer0 = z*2;
665                         layer1 = z*2+1;
666                         if (layer1 >= texture->mipmap[i-1][4])
667                                 layer1 = texture->mipmap[i-1][4]-1;
668                         for (y = 0;y < texture->mipmap[i][3];y++)
669                         {
670                                 row0 = y*2;
671                                 row1 = y*2+1;
672                                 if (row1 >= texture->mipmap[i-1][3])
673                                         row1 = texture->mipmap[i-1][3]-1;
674                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
675                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
676                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
677                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
678                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
679                                 w = texture->mipmap[i][2];
680                                 if (layer1 > layer0)
681                                 {
682                                         if (texture->mipmap[i-1][2] > 1)
683                                         {
684                                                 // average 3D texture
685                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
686                                                 {
687                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
688                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
689                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
690                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691                                                 }
692                                         }
693                                         else
694                                         {
695                                                 // average 3D mipmap with parent width == 1
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
699                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
700                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
701                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
702                                                 }
703                                         }
704                                 }
705                                 else
706                                 {
707                                         if (texture->mipmap[i-1][2] > 1)
708                                         {
709                                                 // average 2D texture (common case)
710                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
711                                                 {
712                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
713                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
714                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
715                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716                                                 }
717                                         }
718                                         else
719                                         {
720                                                 // 2D texture with parent width == 1
721                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
722                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
723                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
724                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
725                                         }
726                                 }
727                         }
728                 }
729         }
730 }
731 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
732 {
733         DPSOFTRAST_Texture *texture;
734         unsigned char *dst;
735         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736         if (texture->binds)
737                 DPSOFTRAST_Flush();
738         if (pixels)
739         {
740                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
741                 while (blockheight > 0)
742                 {
743                         memcpy(dst, pixels, blockwidth * 4);
744                         pixels += blockwidth * 4;
745                         dst += texture->mipmap[0][2] * 4;
746                         blockheight--;
747                 }
748         }
749         DPSOFTRAST_Texture_CalculateMipmaps(index);
750 }
751 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
755         if (texture->binds)
756                 DPSOFTRAST_Flush();
757         if (pixels)
758                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
759         DPSOFTRAST_Texture_CalculateMipmaps(index);
760 }
761 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
762 {
763         DPSOFTRAST_Texture *texture;
764         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
765         return texture->mipmap[mip][2];
766 }
767 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
768 {
769         DPSOFTRAST_Texture *texture;
770         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
771         return texture->mipmap[mip][3];
772 }
773 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
774 {
775         DPSOFTRAST_Texture *texture;
776         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
777         return texture->mipmap[mip][4];
778 }
779 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
780 {
781         DPSOFTRAST_Texture *texture;
782         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
783         if (texture->binds)
784                 DPSOFTRAST_Flush();
785         return texture->bytes + texture->mipmap[mip][0];
786 }
787 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
791         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
792         {
793                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794                 return;
795         }
796         if (texture->binds)
797                 DPSOFTRAST_Flush();
798         texture->filter = filter;
799 }
800
801 static void DPSOFTRAST_Draw_FlushThreads(void);
802
803 static void DPSOFTRAST_Draw_SyncCommands(void)
804 {
805         if(dpsoftrast.usethreads) MEMORY_BARRIER;
806         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
807 }
808
809 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
810 {
811         DPSOFTRAST_State_Thread *thread;
812         int i;
813         int freecommand = dpsoftrast.commandpool.freecommand;
814         int usedcommands = dpsoftrast.commandpool.usedcommands;
815         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
816                 return;
817         DPSOFTRAST_Draw_SyncCommands();
818         for(;;)
819         {
820                 int waitindex = -1;
821                 int commandoffset;
822                 usedcommands = 0;
823                 for (i = 0; i < dpsoftrast.numthreads; i++)
824                 {
825                         thread = &dpsoftrast.threads[i]; 
826                         commandoffset = freecommand - thread->commandoffset;
827                         if (commandoffset < 0)
828                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
829                         if (commandoffset > usedcommands)
830                         {
831                                 waitindex = i;
832                                 usedcommands = commandoffset;
833                         }
834                 }
835                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
836                         break;
837                 thread = &dpsoftrast.threads[waitindex];
838                 Thread_LockMutex(thread->drawmutex);
839                 if (thread->commandoffset != dpsoftrast.drawcommand)
840                 {
841                         thread->waiting = true;
842                         if (thread->starving) Thread_CondSignal(thread->drawcond);
843                         Thread_CondWait(thread->waitcond, thread->drawmutex);
844                         thread->waiting = false;
845                 }
846                 Thread_UnlockMutex(thread->drawmutex);
847         }
848         dpsoftrast.commandpool.usedcommands = usedcommands;
849 }
850
851 #define DPSOFTRAST_ALIGNCOMMAND(size) \
852         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
853 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
854         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
855
856 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
857 {
858         DPSOFTRAST_Command *command;
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         int extra = sizeof(DPSOFTRAST_Command);
862         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
863                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
864         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
865         {
866                 if (dpsoftrast.usethreads)
867                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
868                 else
869                         DPSOFTRAST_Draw_FlushThreads();
870                 freecommand = dpsoftrast.commandpool.freecommand;
871                 usedcommands = dpsoftrast.commandpool.usedcommands;
872         }
873         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874         {
875                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
876                 command->opcode = DPSOFTRAST_OPCODE_Reset;
877                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
878                 freecommand = 0;
879         }
880         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
881         command->opcode = opcode;
882         command->commandsize = size;
883         freecommand += size;
884         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
885                 freecommand = 0;
886         dpsoftrast.commandpool.freecommand = freecommand;
887         dpsoftrast.commandpool.usedcommands = usedcommands + size;
888         return command;
889 }
890
891 static void DPSOFTRAST_UndoCommand(int size)
892 {
893         int freecommand = dpsoftrast.commandpool.freecommand;
894         int usedcommands = dpsoftrast.commandpool.usedcommands;
895         freecommand -= size;
896         if (freecommand < 0)
897                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
898         usedcommands -= size;
899         dpsoftrast.commandpool.freecommand = freecommand;
900         dpsoftrast.commandpool.usedcommands = usedcommands;
901 }
902                 
903 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
904 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
905 {
906         thread->viewport[0] = command->x;
907         thread->viewport[1] = command->y;
908         thread->viewport[2] = command->width;
909         thread->viewport[3] = command->height;
910         thread->validate |= DPSOFTRAST_VALIDATE_FB;
911 }
912 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
913 {
914         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
915         command->x = x;
916         command->y = y;
917         command->width = width;
918         command->height = height;
919
920         dpsoftrast.viewport[0] = x;
921         dpsoftrast.viewport[1] = y;
922         dpsoftrast.viewport[2] = width;
923         dpsoftrast.viewport[3] = height;
924         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
925 }
926
927 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
928 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
929 {
930         int i, x1, y1, x2, y2, w, h, x, y;
931         int miny1, maxy1, miny2, maxy2;
932         int bandy;
933         unsigned int *p;
934         unsigned int c;
935         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
936         miny1 = thread->miny1;
937         maxy1 = thread->maxy1;
938         miny2 = thread->miny2;
939         maxy2 = thread->maxy2;
940         x1 = thread->fb_scissor[0];
941         y1 = thread->fb_scissor[1];
942         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
943         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
944         if (y1 < miny1) y1 = miny1;
945         if (y2 > maxy2) y2 = maxy2;
946         w = x2 - x1;
947         h = y2 - y1;
948         if (w < 1 || h < 1)
949                 return;
950         // FIXME: honor fb_colormask?
951         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
952         for (i = 0;i < 4;i++)
953         {
954                 if (!dpsoftrast.fb_colorpixels[i])
955                         continue;
956                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
957                 for (;y < bandy;y++)
958                 {
959                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
960                         for (x = x1;x < x2;x++)
961                                 p[x] = c;
962                 }
963         }
964 }
965 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
966 {
967         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
968         command->r = r;
969         command->g = g;
970         command->b = b;
971         command->a = a;
972 }
973
974 DEFCOMMAND(3, ClearDepth, float depth;)
975 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
976 {
977         int x1, y1, x2, y2, w, h, x, y;
978         int miny1, maxy1, miny2, maxy2;
979         int bandy;
980         unsigned int *p;
981         unsigned int c;
982         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983         miny1 = thread->miny1;
984         maxy1 = thread->maxy1;
985         miny2 = thread->miny2;
986         maxy2 = thread->maxy2;
987         x1 = thread->fb_scissor[0];
988         y1 = thread->fb_scissor[1];
989         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
990         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
991         if (y1 < miny1) y1 = miny1;
992         if (y2 > maxy2) y2 = maxy2;
993         w = x2 - x1;
994         h = y2 - y1;
995         if (w < 1 || h < 1)
996                 return;
997         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
998         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
999         for (;y < bandy;y++)
1000         {
1001                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1002                 for (x = x1;x < x2;x++)
1003                         p[x] = c;
1004         }
1005 }
1006 void DPSOFTRAST_ClearDepth(float d)
1007 {
1008         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1009         command->depth = d;
1010 }
1011
1012 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1013 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1014 {
1015         thread->colormask[0] = command->r != 0;
1016         thread->colormask[1] = command->g != 0;
1017         thread->colormask[2] = command->b != 0;
1018         thread->colormask[3] = command->a != 0;
1019         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1020 }
1021 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1022 {
1023         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1024         command->r = r;
1025         command->g = g;
1026         command->b = b;
1027         command->a = a;
1028 }
1029
1030 DEFCOMMAND(5, DepthTest, int enable;)
1031 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1032 {
1033         thread->depthtest = command->enable;
1034         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1035 }
1036 void DPSOFTRAST_DepthTest(int enable)
1037 {
1038         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1039         command->enable = enable;
1040 }
1041
1042 DEFCOMMAND(6, ScissorTest, int enable;)
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1044 {
1045         thread->scissortest = command->enable;
1046         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1047 }
1048 void DPSOFTRAST_ScissorTest(int enable)
1049 {
1050         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051         command->enable = enable;
1052 }
1053
1054 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1055 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1056 {
1057         thread->scissor[0] = command->x;
1058         thread->scissor[1] = command->y;
1059         thread->scissor[2] = command->width;
1060         thread->scissor[3] = command->height;
1061         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1062 }
1063 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1064 {
1065         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1066         command->x = x;
1067         command->y = y;
1068         command->width = width;
1069         command->height = height;
1070 }
1071
1072 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1073 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1074 {
1075         thread->blendfunc[0] = command->sfactor;
1076         thread->blendfunc[1] = command->dfactor;
1077         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1078 }
1079 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1080 {
1081         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1082         command->sfactor = sfactor;
1083         command->dfactor = dfactor;
1084 }
1085
1086 DEFCOMMAND(9, BlendSubtract, int enable;)
1087 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1088 {
1089         thread->blendsubtract = command->enable;
1090         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1091 }
1092 void DPSOFTRAST_BlendSubtract(int enable)
1093 {
1094         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1095         command->enable = enable;
1096 }
1097
1098 DEFCOMMAND(10, DepthMask, int enable;)
1099 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1100 {
1101         thread->depthmask = command->enable;
1102 }
1103 void DPSOFTRAST_DepthMask(int enable)
1104 {
1105         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(11, DepthFunc, int func;)
1110 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1111 {
1112         thread->depthfunc = command->func;
1113 }
1114 void DPSOFTRAST_DepthFunc(int func)
1115 {
1116         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1117         command->func = func;
1118 }
1119
1120 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1121 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1122 {
1123         thread->depthrange[0] = command->nearval;
1124         thread->depthrange[1] = command->farval;
1125 }
1126 void DPSOFTRAST_DepthRange(float nearval, float farval)
1127 {
1128         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1129         command->nearval = nearval;
1130         command->farval = farval;
1131 }
1132
1133 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1134 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1135 {
1136         thread->polygonoffset[0] = command->alongnormal;
1137         thread->polygonoffset[1] = command->intoview;
1138 }
1139 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1140 {
1141         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1142         command->alongnormal = alongnormal;
1143         command->intoview = intoview;
1144 }
1145
1146 DEFCOMMAND(14, CullFace, int mode;)
1147 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1148 {
1149         thread->cullface = command->mode;
1150 }
1151 void DPSOFTRAST_CullFace(int mode)
1152 {
1153         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1154         command->mode = mode;
1155 }
1156
1157 DEFCOMMAND(15, AlphaTest, int enable;)
1158 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1159 {
1160         thread->alphatest = command->enable;
1161 }
1162 void DPSOFTRAST_AlphaTest(int enable)
1163 {
1164         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1165         command->enable = enable;
1166 }
1167
1168 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1169 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1170 {
1171         thread->alphafunc = command->func;
1172         thread->alphavalue = command->ref;
1173 }
1174 void DPSOFTRAST_AlphaFunc(int func, float ref)
1175 {
1176         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1177         command->func = func;
1178         command->ref = ref;
1179 }
1180
1181 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1182 {
1183         dpsoftrast.color[0] = r;
1184         dpsoftrast.color[1] = g;
1185         dpsoftrast.color[2] = b;
1186         dpsoftrast.color[3] = a;
1187 }
1188
1189 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1190 {
1191         int outstride = blockwidth * 4;
1192         int instride = dpsoftrast.fb_width * 4;
1193         int bx1 = blockx;
1194         int by1 = blocky;
1195         int bx2 = blockx + blockwidth;
1196         int by2 = blocky + blockheight;
1197         int bw;
1198         int x;
1199         int y;
1200         unsigned char *inpixels;
1201         unsigned char *b;
1202         unsigned char *o;
1203         DPSOFTRAST_Flush();
1204         if (bx1 < 0) bx1 = 0;
1205         if (by1 < 0) by1 = 0;
1206         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1207         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1208         bw = bx2 - bx1;
1209         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1210         if (dpsoftrast.bigendian)
1211         {
1212                 for (y = by1;y < by2;y++)
1213                 {
1214                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1215                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1216                         for (x = bx1;x < bx2;x++)
1217                         {
1218                                 o[0] = b[3];
1219                                 o[1] = b[2];
1220                                 o[2] = b[1];
1221                                 o[3] = b[0];
1222                                 o += 4;
1223                                 b += 4;
1224                         }
1225                 }
1226         }
1227         else
1228         {
1229                 for (y = by1;y < by2;y++)
1230                 {
1231                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1232                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1233                         memcpy(o, b, bw*4);
1234                 }
1235         }
1236
1237 }
1238 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1239 {
1240         int tx1 = tx;
1241         int ty1 = ty;
1242         int tx2 = tx + width;
1243         int ty2 = ty + height;
1244         int sx1 = sx;
1245         int sy1 = sy;
1246         int sx2 = sx + width;
1247         int sy2 = sy + height;
1248         int swidth;
1249         int sheight;
1250         int twidth;
1251         int theight;
1252         int sw;
1253         int sh;
1254         int tw;
1255         int th;
1256         int y;
1257         unsigned int *spixels;
1258         unsigned int *tpixels;
1259         DPSOFTRAST_Texture *texture;
1260         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1261         if (mip < 0 || mip >= texture->mipmaps) return;
1262         DPSOFTRAST_Flush();
1263         spixels = dpsoftrast.fb_colorpixels[0];
1264         swidth = dpsoftrast.fb_width;
1265         sheight = dpsoftrast.fb_height;
1266         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1267         twidth = texture->mipmap[mip][2];
1268         theight = texture->mipmap[mip][3];
1269         if (tx1 < 0) tx1 = 0;
1270         if (ty1 < 0) ty1 = 0;
1271         if (tx2 > twidth) tx2 = twidth;
1272         if (ty2 > theight) ty2 = theight;
1273         if (sx1 < 0) sx1 = 0;
1274         if (sy1 < 0) sy1 = 0;
1275         if (sx2 > swidth) sx2 = swidth;
1276         if (sy2 > sheight) sy2 = sheight;
1277         tw = tx2 - tx1;
1278         th = ty2 - ty1;
1279         sw = sx2 - sx1;
1280         sh = sy2 - sy1;
1281         if (tw > sw) tw = sw;
1282         if (th > sh) th = sh;
1283         if (tw < 1 || th < 1)
1284                 return;
1285         sy1 = sheight - 1 - sy1;
1286         for (y = 0;y < th;y++)
1287                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1288         if (texture->mipmaps > 1)
1289                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1290 }
1291
1292 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1293 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1294 {
1295         if (thread->texbound[command->unitnum])
1296                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1297         thread->texbound[command->unitnum] = command->texture;
1298 }
1299 void DPSOFTRAST_SetTexture(int unitnum, int index)
1300 {
1301         DPSOFTRAST_Command_SetTexture *command;
1302         DPSOFTRAST_Texture *texture;
1303         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1306                 return;
1307         }
1308         texture = DPSOFTRAST_Texture_GetByIndex(index);
1309         if (index && !texture)
1310         {
1311                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312                 return;
1313         }
1314
1315         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1316         command->unitnum = unitnum;
1317         command->texture = texture;
1318
1319         dpsoftrast.texbound[unitnum] = texture;
1320         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1321 }
1322
1323 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1324 {
1325         dpsoftrast.pointer_vertex3f = vertex3f;
1326         dpsoftrast.stride_vertex = stride;
1327 }
1328 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1329 {
1330         dpsoftrast.pointer_color4f = color4f;
1331         dpsoftrast.pointer_color4ub = NULL;
1332         dpsoftrast.stride_color = stride;
1333 }
1334 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1335 {
1336         dpsoftrast.pointer_color4f = NULL;
1337         dpsoftrast.pointer_color4ub = color4ub;
1338         dpsoftrast.stride_color = stride;
1339 }
1340 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1341 {
1342         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1343         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1344         dpsoftrast.stride_texcoord[unitnum] = stride;
1345 }
1346
1347 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1348 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1349 {
1350         thread->shader_mode = command->mode;
1351         thread->shader_permutation = command->permutation;
1352         thread->shader_exactspecularmath = command->exactspecularmath;
1353 }
1354 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1355 {
1356         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1357         command->mode = mode;
1358         command->permutation = permutation;
1359         command->exactspecularmath = exactspecularmath;
1360
1361         dpsoftrast.shader_mode = mode;
1362         dpsoftrast.shader_permutation = permutation;
1363         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1364 }
1365
1366 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1367 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1368 {
1369         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1370 }
1371 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1372 {
1373         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1374         command->index = index;
1375         command->val[0] = v0;
1376         command->val[1] = v1;
1377         command->val[2] = v2;
1378         command->val[3] = v3;
1379
1380         dpsoftrast.uniform4f[index*4+0] = v0;
1381         dpsoftrast.uniform4f[index*4+1] = v1;
1382         dpsoftrast.uniform4f[index*4+2] = v2;
1383         dpsoftrast.uniform4f[index*4+3] = v3;
1384 }
1385 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1386 {
1387         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388         command->index = index;
1389         memcpy(command->val, v, sizeof(command->val));
1390
1391         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1392 }
1393
1394 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1395 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1396 {
1397         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1398 }
1399 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1400 {
1401 #ifdef SSE_POSSIBLE
1402         int i, index;
1403         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1404         {
1405                 __m128 m0, m1, m2, m3;
1406                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1407                 command->index = (DPSOFTRAST_UNIFORM)index;
1408                 if (((size_t)v)&(ALIGN_SIZE-1))
1409                 {
1410                         m0 = _mm_loadu_ps(v);
1411                         m1 = _mm_loadu_ps(v+4);
1412                         m2 = _mm_loadu_ps(v+8);
1413                         m3 = _mm_loadu_ps(v+12);
1414                 }
1415                 else
1416                 {
1417                         m0 = _mm_load_ps(v);
1418                         m1 = _mm_load_ps(v+4);
1419                         m2 = _mm_load_ps(v+8);
1420                         m3 = _mm_load_ps(v+12);
1421                 }
1422                 if (transpose)
1423                 {
1424                         __m128 t0, t1, t2, t3;
1425                         t0 = _mm_unpacklo_ps(m0, m1);
1426                         t1 = _mm_unpacklo_ps(m2, m3);
1427                         t2 = _mm_unpackhi_ps(m0, m1);
1428                         t3 = _mm_unpackhi_ps(m2, m3);
1429                         m0 = _mm_movelh_ps(t0, t1);
1430                         m1 = _mm_movehl_ps(t1, t0);
1431                         m2 = _mm_movelh_ps(t2, t3);
1432                         m3 = _mm_movehl_ps(t3, t2);                     
1433                 }
1434                 _mm_store_ps(command->val, m0);
1435                 _mm_store_ps(command->val+4, m1);
1436                 _mm_store_ps(command->val+8, m2);
1437                 _mm_store_ps(command->val+12, m3);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1439                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1440                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1441                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1442         }
1443 #endif
1444 }
1445
1446 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1447 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1448 {
1449         thread->uniform1i[command->index] = command->val;
1450 }
1451 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1452 {
1453         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1454         command->index = index;
1455         command->val = i0;
1456
1457         dpsoftrast.uniform1i[command->index] = i0;
1458 }
1459
1460 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1461 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1462 {
1463         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1464         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1465 }
1466 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1467 {
1468         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1469         command->clipplane[0] = x;
1470         command->clipplane[1] = y;
1471         command->clipplane[2] = z;
1472         command->clipplane[3] = w;
1473 }
1474
1475 #ifdef SSE_POSSIBLE
// Copies `size` 4-float vectors from a strided byte stream into a packed array.
//   dst    - receives size*4 floats; written with aligned stores, so it must be 16-byte aligned
//   src    - address of the first source vector
//   size   - number of vectors
//   stride - distance in bytes between consecutive source vectors
// Aligned loads are used only when both src and stride are multiples of ALIGN_SIZE.
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	const int misaligned = ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) != 0;

	if (misaligned)
	{
		for (; dst < stop; dst += 4, src += stride)
			_mm_store_ps(dst, _mm_loadu_ps((const float *)src));
		return;
	}
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_load_ps((const float *)src));
}
1498
// Expands four tightly packed xyz triples, supplied as the 12 consecutive
// floats held in v1, v2, v3, into four (x, y, z, 1) vectors at dst[0..15].
// dst is written with aligned stores.
static void DPSOFTRAST_Load3fTo4f_Packed4(float *dst, __m128 v1, __m128 v2, __m128 v3)
{
	const __m128 one = _mm_set_ss(1.0f);
	__m128 dv;
	// every step arranges (?, x, y, z), overwrites lane 0 with 1.0 and then
	// rotates the lanes down by one to get (x, y, z, 1)
	dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
	dv = _mm_move_ss(dv, one);
	_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
	dv = _mm_move_ss(dv, one);
	_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
	dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
	dv = _mm_move_ss(dv, one);
	_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_move_ss(v3, one);
	_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
}

// Converts `size` 3-float vectors from a strided byte stream into packed
// (x, y, z, 1) vectors.
//   dst    - receives size*4 floats; written with aligned stores, so it must be 16-byte aligned
//   src    - address of the first source vector
//   size   - number of vectors
//   stride - distance in bytes between consecutive source vectors
// Tightly packed input (stride == 12) is converted four vectors at a time;
// whatever is left, and every other stride, goes through the per-vector loop.
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	if (stride == sizeof(float[3]))
	{
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			for (; dst < end4; dst += 16, src += 4*sizeof(float[3]))
			{
				const float *f = (const float *)src;
				DPSOFTRAST_Load3fTo4f_Packed4(dst, _mm_loadu_ps(f), _mm_loadu_ps(f + 4), _mm_loadu_ps(f + 8));
			}
		}
		else
		{
			// 48 bytes per step keeps an aligned src aligned
			for (; dst < end4; dst += 16, src += 4*sizeof(float[3]))
			{
				const float *f = (const float *)src;
				DPSOFTRAST_Load3fTo4f_Packed4(dst, _mm_load_ps(f), _mm_load_ps(f + 4), _mm_load_ps(f + 8));
			}
		}
	}
	// Per-vector path. Only the 12 bytes that belong to the vector are read:
	// a full 16-byte load here would touch 4 bytes beyond the last vector of
	// the source array. Neither load used below requires alignment.
	{
		const __m128 one = _mm_set_ss(1.0f);
		for (; dst < end; dst += 4, src += stride)
		{
			const float *f = (const float *)src;
			__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)f); // (x, y, 0, 0)
			__m128 z1 = _mm_unpacklo_ps(_mm_load_ss(f + 2), one);         // (z, 1, 0, 0)
			_mm_store_ps(dst, _mm_movelh_ps(xy, z1));                     // (x, y, z, 1)
		}
	}
}
1575
// Converts `size` 2-float vectors from a strided byte stream into packed
// (x, y, 0, 1) vectors.
//   dst    - receives size*4 floats; written with aligned stores, so it must be 16-byte aligned
//   src    - address of the first source vector
//   size   - number of vectors
//   stride - distance in bytes between consecutive source vectors
// Tightly packed input (stride == 8) is converted two vectors per 16-byte
// load; an odd leftover and every other stride use the 8-byte load loop.
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	const __m128 zw = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);

	if (stride == sizeof(float[2]))
	{
		float * const pairstop = dst + (size&~1)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1))
		{
			// source is not 16-byte aligned
			for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
			{
				const __m128 pair = _mm_loadu_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, zw, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(zw, pair));
			}
		}
		else
		{
			for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
			{
				const __m128 pair = _mm_load_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, zw, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(zw, pair));
			}
		}
	}
	// one vector at a time: the two source floats replace the low half of (0, 0, 0, 1)
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(zw, (const __m64 *)src));
}
1613
// Expands the 16 unsigned bytes in v to 16 floats, multiplies each by `scale`
// and stores them to dst[0..15] with aligned stores.
static void DPSOFTRAST_Load4bTo4f_Packed4(float *dst, __m128i v, __m128 scale)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i lo = _mm_unpacklo_epi8(v, zero); // bytes 0..7 as 16-bit
	const __m128i hi = _mm_unpackhi_epi8(v, zero); // bytes 8..15 as 16-bit
	_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo, zero)), scale));
	_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(lo, zero)), scale));
	_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi, zero)), scale));
	_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi, zero)), scale));
}

// Converts `size` 4-byte vectors from a strided byte stream into packed
// 4-float vectors, scaling each byte by 1/255.
//   dst    - receives size*4 floats; written with aligned stores, so it must be 16-byte aligned
//   src    - address of the first source vector
//   size   - number of vectors
//   stride - distance in bytes between consecutive source vectors
// Tightly packed input (stride == 4) is converted four vectors per 16-byte
// load; leftovers and every other stride go through the per-vector loop.
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 scale = _mm_set1_ps(1.0f/255.0f);
	if (stride == sizeof(unsigned char[4]))
	{
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			for (; dst < end4; dst += 16, src += 4*sizeof(unsigned char[4]))
				DPSOFTRAST_Load4bTo4f_Packed4(dst, _mm_loadu_si128((const __m128i *)src), scale);
		}
		else
		{
			for (; dst < end4; dst += 16, src += 4*sizeof(unsigned char[4]))
				DPSOFTRAST_Load4bTo4f_Packed4(dst, _mm_load_si128((const __m128i *)src), scale);
		}
	}
	for (; dst < end; dst += 4, src += stride)
	{
		const __m128i zero = _mm_setzero_si128();
		__m128i v;
		int packed;
		// src is a byte pointer with no alignment guarantee, so fetch the
		// four bytes with memcpy rather than dereferencing it as an int
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, zero), zero)), scale));
	}
}
1656
// Writes `size` copies of the 4-float vector at src into dst.
// src is read once with an unaligned load; dst is written with aligned stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	float * const stop = dst + 4*size;
	const __m128 value = _mm_loadu_ps(src);

	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1667 #endif
1668
// Transforms `numitems` 4-float vectors by a 4x4 matrix given as 16 floats:
//   out = in[0]*M[0..3] + in[1]*M[4..7] + in[2]*M[8..11] + in[3]*M[12..15]
// out4f is written with aligned stores; in4f may be unaligned. in4f may equal
// out4f (each vector is loaded before it is stored).
// When the matrix is bit-for-bit the identity the data is just copied
// (or left alone if in4f == out4f).
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;

	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}

	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);

	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// input is not 16-byte aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1716
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy
// (so the two ranges must not overlap).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);

	memcpy(out4f, in4f, bytes);
}
1721
1722 #ifdef SSE_POSSIBLE
// Perspective-divides the __m128 vector `in` = (x, y, z, w) and maps it
// through the viewport vectors, storing the result in `out`:
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]*1/w)
// where c = viewportcenter and s = viewportscale.
// Internally the lanes are rotated to (1, x, y, z) so they line up with the
// lane order of c and s, and rotated back afterwards.
// Expands to a brace block that declares locals named `p` and `w`; the
// viewportcenter/viewportscale arguments are evaluated inside that block, so
// they must not be expressions that refer to variables with those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1730
// NOTE(review): the body is token-for-token the same as
// DPSOFTRAST_PROJECTVERTEX, so despite the name it projects all lanes, not
// only y:
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]*1/w)
// with in = (x, y, z, w), c = viewportcenter, s = viewportscale.
// Expands to a brace block that declares locals named `p` and `w`.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1738
// Multiplies the __m128 vector `in` = (x, y, z, w) by a 4x4 matrix supplied
// as four __m128 values and stores the result in `out`:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// Expands to a brace block that declares a local named `p`; m0..m3 are
// evaluated inside that block, so they must not refer to a variable named `p`.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1747
// Computes a vertical screen range for an axis-aligned box after it has been
// transformed by inmatrix16f.
//   starty, endy - receive the truncated range; *endy is one more than the
//                  truncated lower-projection bound
//   minposf      - box minimum corner, 4 floats, 16-byte aligned (aligned load)
//   maxposf      - box maximum corner, 4 floats, 16-byte aligned (aligned load)
//   inmatrix16f  - 16 matrix floats, read with unaligned loads
// Returns an 8-bit mask in which bit k is set when corner k has z + w < 0
// after the transform. Corner bits: bit 0 selects max x, bit 1 max y, bit 2
// max z (and w is taken from maxposf when bit 2 is set).
// Reads dpsoftrast.fb_viewportcenter and dpsoftrast.fb_viewportscale.
// The helper macros BBFRONT and BBCLIP are defined here and are not
// undefined afterwards.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
{
        int clipmask = 0xFF;
        __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // minproj/maxproj start as an empty interval (min 2, max -2); only lane 0 is used
        __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
        __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix row so transformed corners come
        // out as (y, x, z, w) and the scalar (_ss) operations below act on y
        m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
        m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
        m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
        m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if
        // it is >= 0 clear bit k of clipmask and fold y/w into minproj/maxproj
        #define BBFRONT(k, pos) \
        { \
                DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
                clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
                { \
                        __m128 proj; \
                        clipmask &= ~(1<<k); \
                        proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                        minproj = _mm_min_ss(minproj, proj); \
                        maxproj = _mm_max_ss(maxproj, proj); \
                } \
        }
        // the eight corners, built by mixing lanes of minpos and maxpos
        BBFRONT(0, minpos); 
        BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
        BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
        BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner whose bit is still set in clipmask, look at
        // the three corners that differ from it in one bit (k^1, k^2, k^4);
        // for each of those whose bit is clear, interpolate along the edge to
        // the point where clipdist is zero and fold that point's y/w into
        // minproj/maxproj
        #define BBCLIP(k) \
        { \
                if (clipmask&(1<<k)) \
                { \
                        if (!(clipmask&(1<<(k^1)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^2)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^4)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                } \
        }
        BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring element [2] of the viewport vectors into lane 0
        // (presumably the y entry, given the lane order DPSOFTRAST_PROJECTVERTEX uses - confirm where they are filled in)
        viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
        viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected interval to [-2, 2] before scaling
        minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
        maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
        minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
        maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // NOTE(review): start comes from maxproj and end from minproj, which
        // presumably relies on the viewport y scale being negative - confirm
        *starty = _mm_cvttss_si32(maxproj);
        *endy = _mm_cvttss_si32(minproj)+1;
        return clipmask;
}
1821         
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 {
1824         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825         float *end = out4f + numitems*4;
1826         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827         __m128 minpos, maxpos;
1828         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1829         {
1830                 minpos = maxpos = _mm_loadu_ps(in4f);
1831                 while (out4f < end)
1832                 {
1833                         __m128 v = _mm_loadu_ps(in4f);
1834                         minpos = _mm_min_ps(minpos, v);
1835                         maxpos = _mm_max_ps(maxpos, v);
1836                         _mm_store_ps(out4f, v);
1837                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838                         _mm_store_ps(screen4f, v);
1839                         in4f += 4;
1840                         out4f += 4;
1841                         screen4f += 4;
1842                 }
1843         }
1844         else
1845         {
1846                 minpos = maxpos = _mm_load_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_load_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         _mm_store_ps(out4f, v);
1853                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854                         _mm_store_ps(screen4f, v);
1855                         in4f += 4;
1856                         out4f += 4;
1857                         screen4f += 4;
1858                 }
1859         }
1860         if (starty && endy) 
1861         {
1862                 ALIGN(float minposf[4]);
1863                 ALIGN(float maxposf[4]);
1864                 _mm_store_ps(minposf, minpos);
1865                 _mm_store_ps(maxposf, maxpos);
1866                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1867         }
1868         return 0;
1869 }
1870
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 {
1873         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1875         float *end;
1876         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878         end = out4f + numitems*4;
1879         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881         m0 = _mm_loadu_ps(inmatrix16f);
1882         m1 = _mm_loadu_ps(inmatrix16f + 4);
1883         m2 = _mm_loadu_ps(inmatrix16f + 8);
1884         m3 = _mm_loadu_ps(inmatrix16f + 12);
1885         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1886         {
1887                 minpos = maxpos = _mm_loadu_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_loadu_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         else
1903         {
1904                 minpos = maxpos = _mm_load_ps(in4f);
1905                 while (out4f < end)
1906                 {
1907                         __m128 v = _mm_load_ps(in4f);
1908                         minpos = _mm_min_ps(minpos, v);
1909                         maxpos = _mm_max_ps(maxpos, v);
1910                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911                         _mm_store_ps(out4f, v);
1912                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913                         _mm_store_ps(screen4f, v);
1914                         in4f += 4;
1915                         out4f += 4;
1916                         screen4f += 4;
1917                 }
1918         }
1919         if (starty && endy) 
1920         {
1921                 ALIGN(float minposf[4]);
1922                 ALIGN(float maxposf[4]);
1923                 _mm_store_ps(minposf, minpos);
1924                 _mm_store_ps(maxposf, maxpos);
1925                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1926         }
1927         return 0;
1928 }
1929 #endif
1930
// Expands one of the currently bound vertex arrays into packed 4-float
// vectors in dpsoftrast.post_array4f[outarray], covering
// dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex.
//   DPSOFTRAST_ARRAY_POSITION - 3-float positions, expanded to (x, y, z, 1)
//   DPSOFTRAST_ARRAY_COLOR    - float colors if bound, else byte colors if
//                               bound, else dpsoftrast.color repeated
//   anything else             - treated as texcoord array
//                               (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with 2,
//                               3 or 4 float components; nothing is written
//                               if that array is not bound or has another
//                               component count
// Returns the output array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *base;
	int stride;

	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		base = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, base, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			base = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, base, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			base = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, base, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, base, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, base, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, base, count, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1989
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 {
1992         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1994         return data;
1995 }
1996
1997 #if 0
// Projects a vertex array to screen space without a matrix and returns it.
// With inarray >= 0 the array is first loaded into slot `outarray` via
// DPSOFTRAST_Array_Load; otherwise dpsoftrast.post_array4f[outarray] is used.
// Screen coordinates go to dpsoftrast.screencoord4f, and
// dpsoftrast.drawclipped / drawstarty / drawendy are updated.
// Returns NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
2008 #endif
2009
// Transforms a vertex array in place by inmatrix16f, projects it to screen
// space and returns it.
// With inarray >= 0 the array is first loaded into slot `outarray` via
// DPSOFTRAST_Array_Load; otherwise dpsoftrast.post_array4f[outarray] is used.
// Screen coordinates go to dpsoftrast.screencoord4f, and
// dpsoftrast.drawclipped / drawstarty / drawendy are updated.
// Returns NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2020
2021 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2022 {
2023         int x;
2024         int startx = span->startx;
2025         int endx = span->endx;
2026         float wslope = triangle->w[0];
2027         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028         float endz = 1.0f / (w + wslope * startx);
2029         if (triangle->w[0] == 0)
2030         {
2031                 // LordHavoc: fast flat polygons (HUD/menu)
2032                 for (x = startx;x < endx;x++)
2033                         zf[x] = endz;
2034                 return;
2035         }
2036         for (x = startx;x < endx;)
2037         {
2038                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2039                 float z = endz, dz;
2040                 if (nextsub >= endx) nextsub = endsub = endx-1;
2041                 endz = 1.0f / (w + wslope * nextsub);
2042                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043                 for (; x <= endsub; x++, z += dz)
2044                         zf[x] = z;
2045         }
2046 }
2047
2048 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2049 {
2050 #ifdef SSE_POSSIBLE
2051         int x;
2052         int startx = span->startx;
2053         int endx = span->endx;
2054         int maskx;
2055         int subx;
2056         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057         unsigned char * RESTRICT pixelmask = span->pixelmask;
2058         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2060         if (!pixel)
2061                 return;
2062         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063         pixeli += span->y * dpsoftrast.fb_width + span->x;
2064         // handle alphatest now (this affects depth writes too)
2065         if (thread->alphatest)
2066                 for (x = startx;x < endx;x++)
2067                         if (in4ub[x*4+3] < 128)
2068                                 pixelmask[x] = false;
2069         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070         // helps sprites, text and hud artwork
2071         switch(thread->fb_blendmode)
2072         {
2073         case DPSOFTRAST_BLENDMODE_ALPHA:
2074         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2076                 maskx = startx;
2077                 for (x = startx;x < endx;x++)
2078                 {
2079                         if (in4ub[x*4+3] >= 1)
2080                         {
2081                                 startx = x;
2082                                 for (;;)
2083                                 {
2084                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2085                                         maskx = x;
2086                                         if (x >= endx) break;
2087                                         ++x;
2088                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089                                         if (x >= endx) break;
2090                                 }
2091                                 break;
2092                         }
2093                 }
2094                 endx = maskx;
2095                 break;
2096         case DPSOFTRAST_BLENDMODE_OPAQUE:
2097         case DPSOFTRAST_BLENDMODE_ADD:
2098         case DPSOFTRAST_BLENDMODE_INVMOD:
2099         case DPSOFTRAST_BLENDMODE_MUL:
2100         case DPSOFTRAST_BLENDMODE_MUL2:
2101         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102         case DPSOFTRAST_BLENDMODE_INVADD:
2103                 break;
2104         }
2105         // put some special values at the end of the mask to ensure the loops end
2106         pixelmask[endx] = 1;
2107         pixelmask[endx+1] = 0;
2108         // LordHavoc: use a double loop to identify subspans, this helps the
2109         // optimized copy/blend loops to perform at their best, most triangles
2110         // have only one run of pixels, and do the search using wide reads...
2111         x = startx;
2112         while (x < endx)
2113         {
2114                 // if this pixel is masked off, it's probably not alone...
2115                 if (!pixelmask[x])
2116                 {
2117                         x++;
2118 #if 1
2119                         if (x + 8 < endx)
2120                         {
2121                                 // the 4-item search must be aligned or else it stalls badly
2122                                 if ((x & 3) && !pixelmask[x]) 
2123                                 {
2124                                         if(pixelmask[x]) goto endmasked;
2125                                         x++;
2126                                         if (x & 3)
2127                                         {
2128                                                 if(pixelmask[x]) goto endmasked;
2129                                                 x++;
2130                                                 if (x & 3)
2131                                                 {
2132                                                         if(pixelmask[x]) goto endmasked;
2133                                                         x++;
2134                                                 }
2135                                         }
2136                                 }
2137                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2138                                         x += 4;
2139                         }
2140 #endif
2141                         for (;!pixelmask[x];x++)
2142                                 ;
2143                         // rather than continue the loop, just check the end variable
2144                         if (x >= endx)
2145                                 break;
2146                 }
2147         endmasked:
2148                 // find length of subspan
2149                 subx = x + 1;
2150 #if 1
2151                 if (subx + 8 < endx)
2152                 {
2153                         if (subx & 3)
2154                         {
2155                                 if(!pixelmask[subx]) goto endunmasked;
2156                                 subx++;
2157                                 if (subx & 3)
2158                                 {
2159                                         if(!pixelmask[subx]) goto endunmasked;
2160                                         subx++;
2161                                         if (subx & 3)
2162                                         {
2163                                                 if(!pixelmask[subx]) goto endunmasked;
2164                                                 subx++;
2165                                         }
2166                                 }
2167                         }
2168                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2169                                 subx += 4;
2170                 }
2171 #endif
2172                 for (;pixelmask[subx];subx++)
2173                         ;
2174                 // the checks can overshoot, so make sure to clip it...
2175                 if (subx > endx)
2176                         subx = endx;
2177         endunmasked:
2178                 // now that we know the subspan length...  process!
2179                 switch(thread->fb_blendmode)
2180                 {
2181                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2182 #if 0
2183                         if (subx - x >= 16)
2184                         {
2185                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2186                                 x = subx;
2187                         }
2188                         else
2189 #elif 1
2190                         while (x + 16 <= subx)
2191                         {
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2196                                 x += 16;
2197                         }
2198 #endif
2199                         {
2200                                 while (x + 4 <= subx)
2201                                 {
2202                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2203                                         x += 4;
2204                                 }
2205                                 if (x + 2 <= subx)
2206                                 {
2207                                         pixeli[x] = ini[x];
2208                                         pixeli[x+1] = ini[x+1];
2209                                         x += 2;
2210                                 }
2211                                 if (x < subx)
2212                                 {
2213                                         pixeli[x] = ini[x];
2214                                         x++;
2215                                 }
2216                         }
2217                         break;
2218                 case DPSOFTRAST_BLENDMODE_ALPHA:
2219                 #define FINISHBLEND(blend2, blend1) \
2220                         for (;x + 1 < subx;x += 2) \
2221                         { \
2222                                 __m128i src, dst; \
2223                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2225                                 blend2; \
2226                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2227                         } \
2228                         if (x < subx) \
2229                         { \
2230                                 __m128i src, dst; \
2231                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2233                                 blend1; \
2234                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235                                 x++; \
2236                         }
2237                         FINISHBLEND({
2238                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                         }, {
2241                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2243                         });
2244                         break;
2245                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2246                         FINISHBLEND({
2247                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                         }, {
2250                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_ADD:
2255                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_INVMOD:
2258                         FINISHBLEND({
2259                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260                         }, {
2261                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2262                         });
2263                         break;
2264                 case DPSOFTRAST_BLENDMODE_MUL:
2265                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2266                         break;
2267                 case DPSOFTRAST_BLENDMODE_MUL2:
2268                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2269                         break;
2270                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2271                         FINISHBLEND({
2272                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                         }, {
2275                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2277                         });
2278                         break;
2279                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2280                         FINISHBLEND({
2281                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                         }, {
2284                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2286                         });
2287                         break;
2288                 case DPSOFTRAST_BLENDMODE_INVADD:
2289                         FINISHBLEND({
2290                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291                         }, {
2292                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2293                         });
2294                         break;
2295                 }
2296         }
2297 #endif
2298 }
2299
2300 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2301 {
2302         int x;
2303         int startx = span->startx;
2304         int endx = span->endx;
2305         int flags;
2306         float c[4];
2307         float data[4];
2308         float slope[4];
2309         float tc[2], endtc[2];
2310         float tcscale[2];
2311         unsigned int tci[2];
2312         unsigned int tci1[2];
2313         unsigned int tcimin[2];
2314         unsigned int tcimax[2];
2315         int tciwrapmask[2];
2316         int tciwidth;
2317         int filter;
2318         int mip;
2319         const unsigned char * RESTRICT pixelbase;
2320         const unsigned char * RESTRICT pixel[4];
2321         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2322         // if no texture is bound, just fill it with white
2323         if (!texture)
2324         {
2325                 for (x = startx;x < endx;x++)
2326                 {
2327                         out4f[x*4+0] = 1.0f;
2328                         out4f[x*4+1] = 1.0f;
2329                         out4f[x*4+2] = 1.0f;
2330                         out4f[x*4+3] = 1.0f;
2331                 }
2332                 return;
2333         }
2334         mip = triangle->mip[texunitindex];
2335         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2336         // if this mipmap of the texture is 1 pixel, just fill it with that color
2337         if (texture->mipmap[mip][1] == 4)
2338         {
2339                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2340                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2341                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2342                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2343                 for (x = startx;x < endx;x++)
2344                 {
2345                         out4f[x*4+0] = c[0];
2346                         out4f[x*4+1] = c[1];
2347                         out4f[x*4+2] = c[2];
2348                         out4f[x*4+3] = c[3];
2349                 }
2350                 return;
2351         }
2352         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2353         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2354         flags = texture->flags;
2355         tcscale[0] = texture->mipmap[mip][2];
2356         tcscale[1] = texture->mipmap[mip][3];
2357         tciwidth = texture->mipmap[mip][2];
2358         tcimin[0] = 0;
2359         tcimin[1] = 0;
2360         tcimax[0] = texture->mipmap[mip][2]-1;
2361         tcimax[1] = texture->mipmap[mip][3]-1;
2362         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2363         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2364         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2365         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2366         if (filter)
2367         {
2368                 endtc[0] -= 0.5f;
2369                 endtc[1] -= 0.5f;
2370         }
2371         for (x = startx;x < endx;)
2372         {
2373                 unsigned int subtc[2];
2374                 unsigned int substep[2];
2375                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2376                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2377                 if (nextsub >= endx)
2378                 {
2379                         nextsub = endsub = endx-1;      
2380                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2381                 }
2382                 tc[0] = endtc[0];
2383                 tc[1] = endtc[1];
2384                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2385                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2386                 if (filter)
2387                 {
2388                         endtc[0] -= 0.5f;
2389                         endtc[1] -= 0.5f;
2390                 }
2391                 substep[0] = (endtc[0] - tc[0]) * subscale;
2392                 substep[1] = (endtc[1] - tc[1]) * subscale;
2393                 subtc[0] = tc[0] * (1<<12);
2394                 subtc[1] = tc[1] * (1<<12);
2395                 if (filter)
2396                 {
2397                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2398                         {
2399                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2400                                 {
2401                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2402                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2403                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2404                                         tci[0] = subtc[0]>>12;
2405                                         tci[1] = subtc[1]>>12;
2406                                         tci1[0] = tci[0] + 1;
2407                                         tci1[1] = tci[1] + 1;
2408                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2409                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2410                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2411                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2412                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2413                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2414                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2415                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2416                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2417                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2418                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2419                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2420                                         out4f[x*4+0] = c[0];
2421                                         out4f[x*4+1] = c[1];
2422                                         out4f[x*4+2] = c[2];
2423                                         out4f[x*4+3] = c[3];
2424                                 }
2425                         }
2426                         else
2427                         {
2428                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2429                                 {
2430                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2431                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2432                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2433                                         tci[0] = subtc[0]>>12;
2434                                         tci[1] = subtc[1]>>12;
2435                                         tci1[0] = tci[0] + 1;
2436                                         tci1[1] = tci[1] + 1;
2437                                         tci[0] &= tciwrapmask[0];
2438                                         tci[1] &= tciwrapmask[1];
2439                                         tci1[0] &= tciwrapmask[0];
2440                                         tci1[1] &= tciwrapmask[1];
2441                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2442                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2443                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2444                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2445                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2446                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2447                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2448                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2449                                         out4f[x*4+0] = c[0];
2450                                         out4f[x*4+1] = c[1];
2451                                         out4f[x*4+2] = c[2];
2452                                         out4f[x*4+3] = c[3];
2453                                 }
2454                         }
2455                 }
2456                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2457                 {
2458                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2459                         {
2460                                 tci[0] = subtc[0]>>12;
2461                                 tci[1] = subtc[1]>>12;
2462                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2463                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2464                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2465                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2466                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2467                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2468                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2469                                 out4f[x*4+0] = c[0];
2470                                 out4f[x*4+1] = c[1];
2471                                 out4f[x*4+2] = c[2];
2472                                 out4f[x*4+3] = c[3];
2473                         }
2474                 }
2475                 else
2476                 {
2477                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2478                         {
2479                                 tci[0] = subtc[0]>>12;
2480                                 tci[1] = subtc[1]>>12;
2481                                 tci[0] &= tciwrapmask[0];
2482                                 tci[1] &= tciwrapmask[1];
2483                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2484                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2485                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2486                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2487                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2488                                 out4f[x*4+0] = c[0];
2489                                 out4f[x*4+1] = c[1];
2490                                 out4f[x*4+2] = c[2];
2491                                 out4f[x*4+3] = c[3];
2492                         }
2493                 }
2494         }
2495 }
2496
2497 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2498 {
2499 #ifdef SSE_POSSIBLE
2500         int x;
2501         int startx = span->startx;
2502         int endx = span->endx;
2503         int flags;
2504         __m128 data, slope, tcscale;
2505         __m128i tcsize, tcmask, tcoffset, tcmax;
2506         __m128 tc, endtc;
2507         __m128i subtc, substep, endsubtc;
2508         int filter;
2509         int mip;
2510         int affine; // LordHavoc: optimized affine texturing case
2511         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2512         const unsigned char * RESTRICT pixelbase;
2513         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2514         // if no texture is bound, just fill it with white
2515         if (!texture)
2516         {
2517                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2518                 return;
2519         }
2520         mip = triangle->mip[texunitindex];
2521         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2522         // if this mipmap of the texture is 1 pixel, just fill it with that color
2523         if (texture->mipmap[mip][1] == 4)
2524         {
2525                 unsigned int k = *((const unsigned int *)pixelbase);
2526                 for (x = startx;x < endx;x++)
2527                         outi[x] = k;
2528                 return;
2529         }
2530         affine = zf[startx] == zf[endx-1];
2531         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2532         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2533         flags = texture->flags;
2534         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2535         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2536         tcscale = _mm_cvtepi32_ps(tcsize);
2537         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2538         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2539         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2540         if (filter)
2541                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2542         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2543         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2544         tcmax = _mm_packs_epi32(tcmask, tcmask);
2545         for (x = startx;x < endx;)
2546         {
2547                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2548                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2549                 if (nextsub >= endx || affine)
2550                 {
2551                         nextsub = endsub = endx-1;
2552                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2553                 }       
2554                 tc = endtc;
2555                 subtc = endsubtc;
2556                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2557                 if (filter)
2558                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2559                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2560                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2561                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2562                 substep = _mm_slli_epi32(substep, 1);
2563                 if (filter)
2564                 {
2565                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2566                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2567                         {
2568                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2569                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2570                                 {
2571                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2572                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2573                                         tci = _mm_madd_epi16(tci, tcoffset);
2574                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2575                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2576                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2577                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2578                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2579                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2580                                         fracm = _mm_srli_epi16(subtc, 1);
2581                                         pix1 = _mm_add_epi16(pix1,
2582                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2583                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2584                                         pix3 = _mm_add_epi16(pix3,
2585                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2586                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2587                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2588                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2589                                         pix2 = _mm_add_epi16(pix2,
2590                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2591                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2592                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2593                                 }
2594                                 if (x <= endsub)
2595                                 {
2596                                         const unsigned char * RESTRICT ptr1;
2597                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2598                                         tci = _mm_madd_epi16(tci, tcoffset);
2599                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2600                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2601                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2602                                         fracm = _mm_srli_epi16(subtc, 1);
2603                                         pix1 = _mm_add_epi16(pix1,
2604                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2605                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2606                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2607                                         pix1 = _mm_add_epi16(pix1,
2608                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2610                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2611                                         x++;
2612                                 }
2613                         }
2614                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2615                         {
2616                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2617                                 {
2618                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2619                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2620                                         tci = _mm_madd_epi16(tci, tcoffset);
2621                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2622                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2623                                                                                         _mm_setzero_si128());
2624                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2625                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2626                                                                                         _mm_setzero_si128());
2627                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2628                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2629                                         tci = _mm_madd_epi16(tci, tcoffset);
2630                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632                                                                                         _mm_setzero_si128());
2633                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635                                                                                         _mm_setzero_si128());
2636                                         fracm = _mm_srli_epi16(subtc, 1);
2637                                         pix1 = _mm_add_epi16(pix1,
2638                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2639                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2640                                         pix3 = _mm_add_epi16(pix3,
2641                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2642                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2643                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2644                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2645                                         pix2 = _mm_add_epi16(pix2,
2646                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2647                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2648                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2649                                 }
2650                                 if (x <= endsub)
2651                                 {
2652                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2653                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2654                                         tci = _mm_madd_epi16(tci, tcoffset);
2655                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2656                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2657                                                                                         _mm_setzero_si128());
2658                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2659                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2660                                                                                         _mm_setzero_si128());
2661                                         fracm = _mm_srli_epi16(subtc, 1);
2662                                         pix1 = _mm_add_epi16(pix1,
2663                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2664                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2665                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2666                                         pix1 = _mm_add_epi16(pix1,
2667                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2669                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2670                                         x++;
2671                                 }
2672                         }
2673                         else
2674                         {
2675                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2676                                 {
2677                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2678                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2679                                         tci = _mm_madd_epi16(tci, tcoffset);
2680                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2681                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2682                                                                                         _mm_setzero_si128());
2683                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2684                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2685                                                                                         _mm_setzero_si128());
2686                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2687                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2688                                         tci = _mm_madd_epi16(tci, tcoffset);
2689                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2690                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2691                                                                                         _mm_setzero_si128());
2692                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2693                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2694                                                                                         _mm_setzero_si128());
2695                                         fracm = _mm_srli_epi16(subtc, 1);
2696                                         pix1 = _mm_add_epi16(pix1,
2697                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2698                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2699                                         pix3 = _mm_add_epi16(pix3,
2700                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2701                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2702                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2703                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2704                                         pix2 = _mm_add_epi16(pix2,
2705                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2706                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2707                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2708                                 }
2709                                 if (x <= endsub)
2710                                 {
2711                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2712                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2713                                         tci = _mm_madd_epi16(tci, tcoffset);
2714                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2715                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2716                                                                                         _mm_setzero_si128());
2717                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2718                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2719                                                                                         _mm_setzero_si128());
2720                                         fracm = _mm_srli_epi16(subtc, 1);
2721                                         pix1 = _mm_add_epi16(pix1,
2722                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2723                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2724                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2725                                         pix1 = _mm_add_epi16(pix1,
2726                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2727                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2728                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2729                                         x++;
2730                                 }
2731                         }
2732                 }
2733                 else
2734                 {
2735                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2736                         {
2737                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738                                 {
2739                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2740                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2741                                         tci = _mm_madd_epi16(tci, tcoffset);
2742                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2743                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2744                                 }
2745                                 if (x <= endsub)
2746                                 {
2747                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2748                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2749                                         tci = _mm_madd_epi16(tci, tcoffset);
2750                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2751                                         x++;
2752                                 }
2753                         }
2754                         else
2755                         {
2756                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2757                                 {
2758                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2759                                         tci = _mm_and_si128(tci, tcmax); 
2760                                         tci = _mm_madd_epi16(tci, tcoffset);
2761                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2762                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2763                                 }
2764                                 if (x <= endsub)
2765                                 {
2766                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2767                                         tci = _mm_and_si128(tci, tcmax); 
2768                                         tci = _mm_madd_epi16(tci, tcoffset);
2769                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2770                                         x++;
2771                                 }
2772                         }
2773                 }
2774         }
2775 #endif
2776 }
2777
2778 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2779 {
2780         // TODO: IMPLEMENT
2781         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2782 }
2783
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	// placeholder: the input vector is not read, the result is always 1.0f
	const float placeholder = 1.0f;
	(void)vector;
	return placeholder;
}
2789
2790 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2791 {
2792         int x;
2793         int startx = span->startx;
2794         int endx = span->endx;
2795         float c[4];
2796         float data[4];
2797         float slope[4];
2798         float z;
2799         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2800         for (x = startx;x < endx;x++)
2801         {
2802                 z = zf[x];
2803                 c[0] = (data[0] + slope[0]*x) * z;
2804                 c[1] = (data[1] + slope[1]*x) * z;
2805                 c[2] = (data[2] + slope[2]*x) * z;
2806                 c[3] = (data[3] + slope[3]*x) * z;
2807                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2808                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2809                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2810                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2811         }
2812 }
2813
2814 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2815 {
2816         int x;
2817         int startx = span->startx;
2818         int endx = span->endx;
2819         float c[4];
2820         float data[4];
2821         float slope[4];
2822         float z;
2823         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2824         for (x = startx;x < endx;x++)
2825         {
2826                 z = zf[x];
2827                 c[0] = (data[0] + slope[0]*x) * z;
2828                 c[1] = (data[1] + slope[1]*x) * z;
2829                 c[2] = (data[2] + slope[2]*x) * z;
2830                 c[3] = (data[3] + slope[3]*x) * z;
2831                 out4f[x*4+0] = c[0];
2832                 out4f[x*4+1] = c[1];
2833                 out4f[x*4+2] = c[2];
2834                 out4f[x*4+3] = c[3];
2835         }
2836 }
2837
2838 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2839 {
2840         int x, startx = span->startx, endx = span->endx;
2841         float c[4], localcolor[4];
2842         localcolor[0] = subcolor[0];
2843         localcolor[1] = subcolor[1];
2844         localcolor[2] = subcolor[2];
2845         localcolor[3] = subcolor[3];
2846         for (x = startx;x < endx;x++)
2847         {
2848                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2849                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2850                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2851                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2852                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2853                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2854                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2855                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2856         }
2857 }
2858
2859 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2860 {
2861         int x, startx = span->startx, endx = span->endx;
2862         for (x = startx;x < endx;x++)
2863         {
2864                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2865                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2866                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2867                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2868         }
2869 }
2870
2871 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2872 {
2873         int x, startx = span->startx, endx = span->endx;
2874         for (x = startx;x < endx;x++)
2875         {
2876                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2877                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2878                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2879                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2880         }
2881 }
2882
2883 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2884 {
2885         int x, startx = span->startx, endx = span->endx;
2886         float a, b;
2887         for (x = startx;x < endx;x++)
2888         {
2889                 a = 1.0f - inb4f[x*4+3];
2890                 b = inb4f[x*4+3];
2891                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2892                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2893                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2894                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2895         }
2896 }
2897
2898 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2899 {
2900         int x, startx = span->startx, endx = span->endx;
2901         float localcolor[4], ilerp, lerp;
2902         localcolor[0] = color[0];
2903         localcolor[1] = color[1];
2904         localcolor[2] = color[2];
2905         localcolor[3] = color[3];
2906         ilerp = 1.0f - localcolor[3];
2907         lerp = localcolor[3];
2908         for (x = startx;x < endx;x++)
2909         {
2910                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2911                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2912                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2913                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2914         }
2915 }
2916
2917
2918
2919 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2920 {
2921 #ifdef SSE_POSSIBLE
2922         int x;
2923         int startx = span->startx;
2924         int endx = span->endx;
2925         __m128 data, slope;
2926         __m128 mod, endmod;
2927         __m128i submod, substep, endsubmod;
2928         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2929         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2930         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2931         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2932         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2933         for (x = startx; x < endx;)
2934         {
2935                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2936                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2937                 if (nextsub >= endx)
2938                 {
2939                         nextsub = endsub = endx-1;
2940                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2941                 }
2942                 mod = endmod;
2943                 submod = endsubmod;
2944                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2945                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2946                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2947                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2948                 substep = _mm_packs_epi32(substep, substep);
2949                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2950                 {
2951                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2952                         pix = _mm_mulhi_epu16(pix, submod);
2953                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2954                 }
2955                 if (x <= endsub)
2956                 {
2957                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2958                         pix = _mm_mulhi_epu16(pix, submod);
2959                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2960                         x++;
2961                 }
2962         }
2963 #endif
2964 }
2965
2966 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2967 {
2968 #ifdef SSE_POSSIBLE
2969         int x;
2970         int startx = span->startx;
2971         int endx = span->endx;
2972         __m128 data, slope;
2973         __m128 mod, endmod;
2974         __m128i submod, substep, endsubmod;
2975         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2976         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2977         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2978         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2979         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2980         for (x = startx; x < endx;)
2981         {
2982                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2983                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2984                 if (nextsub >= endx)
2985                 {
2986                         nextsub = endsub = endx-1;
2987                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2988                 }
2989                 mod = endmod;
2990                 submod = endsubmod;
2991                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2992                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2993                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2994                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2995                 substep = _mm_packs_epi32(substep, substep);
2996                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2997                 {
2998                         __m128i pix = _mm_srai_epi16(submod, 4);
2999                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3000                 }
3001                 if (x <= endsub)
3002                 {
3003                         __m128i pix = _mm_srai_epi16(submod, 4);
3004                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3005                         x++;
3006                 }
3007         }
3008 #endif
3009 }
3010
3011 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3012 {
3013 #ifdef SSE_POSSIBLE
3014         int x, startx = span->startx, endx = span->endx;
3015         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3016         localcolor = _mm_packs_epi32(localcolor, localcolor);
3017         for (x = startx;x+2 <= endx;x+=2)
3018         {
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3021                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3022                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3023         }
3024         if (x < endx)
3025         {
3026                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3028                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3029                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3030         }
3031 #endif
3032 }
3033
3034 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3035 {
3036 #ifdef SSE_POSSIBLE
3037         int x, startx = span->startx, endx = span->endx;
3038         for (x = startx;x+2 <= endx;x+=2)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3042                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3043                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3044         }
3045         if (x < endx)
3046         {
3047                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3048                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3049                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3050                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3051         }
3052 #endif
3053 }
3054
3055 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3056 {
3057 #ifdef SSE_POSSIBLE
3058         int x, startx = span->startx, endx = span->endx;
3059         for (x = startx;x+2 <= endx;x+=2)
3060         {
3061                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3062                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3063                 pix1 = _mm_add_epi16(pix1, pix2);
3064                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3065         }
3066         if (x < endx)
3067         {
3068                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3069                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3070                 pix1 = _mm_add_epi16(pix1, pix2);
3071                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3072         }
3073 #endif
3074 }
3075
3076 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3077 {
3078 #ifdef SSE_POSSIBLE
3079         int x, startx = span->startx, endx = span->endx;
3080         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3081         tint = _mm_packs_epi32(tint, tint);
3082         for (x = startx;x+2 <= endx;x+=2)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3086                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3087                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3088         }
3089         if (x < endx)
3090         {
3091                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3092                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3093                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3094                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3095         }
3096 #endif
3097 }
3098
3099 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3100 {
3101 #ifdef SSE_POSSIBLE
3102         int x, startx = span->startx, endx = span->endx;
3103         for (x = startx;x+2 <= endx;x+=2)
3104         {
3105                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3106                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3107                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3108                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3109                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3110         }
3111         if (x < endx)
3112         {
3113                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3114                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3115                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3116                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3117                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3118         }
3119 #endif
3120 }
3121
3122 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3123 {
3124 #ifdef SSE_POSSIBLE
3125         int x, startx = span->startx, endx = span->endx;
3126         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3127         localcolor = _mm_packs_epi32(localcolor, localcolor);
3128         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3129         for (x = startx;x+2 <= endx;x+=2)
3130         {
3131                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3132                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3133                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3134         }
3135         if (x < endx)
3136         {
3137                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3138                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3139                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3140         }
3141 #endif
3142 }
3143
3144
3145
3146 void DPSOFTRAST_VertexShader_Generic(void)
3147 {
3148         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3149         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3150         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3151         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3152                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3153 }
3154
3155 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3156 {
3157         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3158         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3159         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3160         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3161         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3162         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3163         {
3164                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3165                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3166                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3167                 {
3168                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3169                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3170                         {
3171                                 // multiply
3172                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3173                         }
3174                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3175                         {
3176                                 // add
3177                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3178                         }
3179                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3180                         {
3181                                 // alphablend
3182                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3183                         }
3184                 }
3185         }
3186         else
3187                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3188         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3189 }
3190
3191
3192
3193 void DPSOFTRAST_VertexShader_PostProcess(void)
3194 {
3195         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3196         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3197         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3198 }
3199
3200 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3201 {
3202         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3203         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3204         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3205         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3207         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3208         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3209         {
3210                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3211                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3212         }
3213         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3214         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3215         {
3216                 // TODO: implement saturation
3217         }
3218         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3219         {
3220                 // TODO: implement gammaramps
3221         }
3222         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3223 }
3224
3225
3226
3227 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3228 {
3229         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3230 }
3231
3232 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3233 {
3234         // this is never called (because colormask is off when this shader is used)
3235         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3236         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3238         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3239         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3240 }
3241
3242
3243
3244 void DPSOFTRAST_VertexShader_FlatColor(void)
3245 {
3246         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3247         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3248 }
3249
3250 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3251 {
3252 #ifdef SSE_POSSIBLE
3253         unsigned char * RESTRICT pixelmask = span->pixelmask;
3254         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3255         int x, startx = span->startx, endx = span->endx;
3256         __m128i Color_Ambientm;
3257         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3258         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3259         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3260         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3261         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3262         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3263                 pixel = buffer_FragColorbgra8;
3264         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3265         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3266         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3267         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3268         for (x = startx;x < endx;x++)
3269         {
3270                 __m128i color, pix;
3271                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3272                 {
3273                         __m128i pix2;
3274                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3275                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3276                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3277                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3278                         x += 3;
3279                         continue;
3280                 }
3281                 if (!pixelmask[x])
3282                         continue;
3283                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3284                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3285                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3286         }
3287         if (pixel == buffer_FragColorbgra8)
3288                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3289 #endif
3290 }
3291
3292
3293
3294 void DPSOFTRAST_VertexShader_VertexColor(void)
3295 {
3296         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3297         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3298         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3299 }
3300
3301 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3302 {
3303 #ifdef SSE_POSSIBLE
3304         unsigned char * RESTRICT pixelmask = span->pixelmask;
3305         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3306         int x, startx = span->startx, endx = span->endx;
3307         __m128i Color_Ambientm, Color_Diffusem;
3308         __m128 data, slope;
3309         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3310         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3311         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3312         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3313         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3314         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3315         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3316                 pixel = buffer_FragColorbgra8;
3317         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3318         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3319         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3320         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3321         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3322         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3323         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3324         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3325         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3326         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3327         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3328         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3329         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3330         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3331         {
3332                 __m128i color, mod, pix;
3333                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3334                 {
3335                         __m128i pix2, mod2;
3336                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3337                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3338                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3339                         data = _mm_add_ps(data, slope);
3340                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3341                         data = _mm_add_ps(data, slope);
3342                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3343                         data = _mm_add_ps(data, slope);
3344                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3345                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3346                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3347                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3348                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3349                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3350                         x += 3;
3351                         continue;
3352                 }
3353                 if (!pixelmask[x])
3354                         continue;
3355                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3356                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3357                 mod = _mm_packs_epi32(mod, mod);
3358                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3359                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3360         }
3361         if (pixel == buffer_FragColorbgra8)
3362                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3363 #endif
3364 }
3365
3366
3367
3368 void DPSOFTRAST_VertexShader_Lightmap(void)
3369 {
3370         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3371         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3372         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3373 }
3374
3375 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3376 {
3377 #ifdef SSE_POSSIBLE
3378         unsigned char * RESTRICT pixelmask = span->pixelmask;
3379         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3380         int x, startx = span->startx, endx = span->endx;
3381         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3382         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3383         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3384         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3386         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3387         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3388         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3389         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3390         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3391                 pixel = buffer_FragColorbgra8;
3392         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3393         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3394         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3395         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3396         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3397         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3398         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3399         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3400         {
3401                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3402                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3403                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3404                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3405                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3406                 for (x = startx;x < endx;x++)
3407                 {
3408                         __m128i color, lightmap, glow, pix;
3409                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3410                         {
3411                                 __m128i pix2;
3412                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3413                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3414                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3415                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3416                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3417                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3418                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3419                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3420                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3421                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3422                                 x += 3;
3423                                 continue;
3424                         }
3425                         if (!pixelmask[x])
3426                                 continue;
3427                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3428                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3429                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3430                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3431                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3432                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3433                 }
3434         }
3435         else
3436         {
3437                 for (x = startx;x < endx;x++)
3438                 {
3439                         __m128i color, lightmap, pix;
3440                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3441                         {
3442                                 __m128i pix2;
3443                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3444                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3445                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3446                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3447                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3448                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3449                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3450                                 x += 3;
3451                                 continue;
3452                         }
3453                         if (!pixelmask[x]) 
3454                                 continue;
3455                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3456                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3457                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3458                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3459                 }
3460         }
3461         if (pixel == buffer_FragColorbgra8)
3462                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3463 #endif
3464 }
3465
3466
// Forward declarations of the shared light-direction vertex and pixel stages.
// They are needed because the FakeLight and LightDirectionMap wrappers that follow
// call these functions before their definitions appear further down in the file.
3467 void DPSOFTRAST_VertexShader_LightDirection(void);
3468 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3469
// Vertex shader entry point for the FakeLight variant.
// It does no work of its own: it simply forwards to the shared
// DPSOFTRAST_VertexShader_LightDirection stage.
3470 void DPSOFTRAST_VertexShader_FakeLight(void)
3471 {
3472         DPSOFTRAST_VertexShader_LightDirection();
3473 }
3474
// Pixel shader entry point for the FakeLight variant.
// Passes thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection;
// that shared function contains its own SHADERMODE_FAKELIGHT branches, selected by
// thread->shader_mode.
3475 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3476 {
3477         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3478 }
3479
3480
3481
// Vertex shader entry point for the model-space light-direction-map variant.
// Runs the shared DPSOFTRAST_VertexShader_LightDirection stage first, then also
// passes DPSOFTRAST_ARRAY_TEXCOORD4 through DPSOFTRAST_Array_Load (same array id as
// source and destination). The shared pixel shader samples the lightmap and deluxemap
// textures with TEXCOORD4 when shader_mode is SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3482 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3483 {
3484         DPSOFTRAST_VertexShader_LightDirection();
             // NOTE(review): presumably copies the attribute through unmodified - confirm against DPSOFTRAST_Array_Load
3485         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3486 }
3487
// Pixel shader entry point for the model-space light-direction-map variant.
// Forwards all three arguments unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which handles SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE in its own branches keyed on
// thread->shader_mode.
3488 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3489 {
3490         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3491 }
3492
3493
3494
// Vertex shader entry point for the tangent-space light-direction-map variant.
// Runs the shared DPSOFTRAST_VertexShader_LightDirection stage first, then also
// passes DPSOFTRAST_ARRAY_TEXCOORD4 through DPSOFTRAST_Array_Load (same array id as
// source and destination). The shared pixel shader samples the lightmap and deluxemap
// textures with TEXCOORD4 when shader_mode is SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3495 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3496 {
3497         DPSOFTRAST_VertexShader_LightDirection();
             // NOTE(review): presumably copies the attribute through unmodified - confirm against DPSOFTRAST_Array_Load
3498         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3499 }
3500
// Pixel shader entry point for the tangent-space light-direction-map variant.
// Forwards all three arguments unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which handles SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE in its own branches keyed on
// thread->shader_mode.
3501 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3502 {
3503         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3504 }
3505
3506
3507
3508 void DPSOFTRAST_VertexShader_LightDirection(void)
3509 {
3510         int i;
3511         int numvertices = dpsoftrast.numvertices;
3512         float LightDir[4];
3513         float LightVector[4];
3514         float EyePosition[4];
3515         float EyeVectorModelSpace[4];
3516         float EyeVector[4];
3517         float position[4];
3518         float svector[4];
3519         float tvector[4];
3520         float normal[4];
3521         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3522         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3523         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3524         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3525         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3526         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3527         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3528         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3529         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3530         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3531         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3532         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3533         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3534         for (i = 0;i < numvertices;i++)
3535         {
3536                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3537                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3538                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3539                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3540                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3541                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3542                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3543                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3544                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3545                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3546                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3547                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3548                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3549                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3550                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3551                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3552                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3553                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3554                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3555                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3556                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3557                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3558                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3559                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3560                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3561                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3562                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3563                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3564                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3565         }
3566         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3567 }
3568
3569 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3570 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3571 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3572 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3573 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3574 #define DPSOFTRAST_Vector3Normalize(v)\
3575 do\
3576 {\
3577         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3578         if (len)\
3579         {\
3580                 len = 1.0f / len;\
3581                 v[0] *= len;\
3582                 v[1] *= len;\
3583                 v[2] *= len;\
3584         }\
3585 }\
3586 while(0)
3587
3588 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3589 {
3590         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3591         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3592         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3593         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3594         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3595         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3596         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3597         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3598         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3599         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3600         int x, startx = span->startx, endx = span->endx;
3601         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3602         float LightVectordata[4];
3603         float LightVectorslope[4];
3604         float EyeVectordata[4];
3605         float EyeVectorslope[4];
3606         float VectorSdata[4];
3607         float VectorSslope[4];
3608         float VectorTdata[4];
3609         float VectorTslope[4];
3610         float VectorRdata[4];
3611         float VectorRslope[4];
3612         float z;
3613         float diffusetex[4];
3614         float glosstex[4];
3615         float surfacenormal[4];
3616         float lightnormal[4];
3617         float lightnormal_modelspace[4];
3618         float eyenormal[4];
3619         float specularnormal[4];
3620         float diffuse;
3621         float specular;
3622         float SpecularPower;
3623         int d[4];
3624         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3625         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3626         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3627         Color_Glow[3] = 0.0f;
3628         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3629         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3630         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3631         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3632         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3633         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3634         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3635         Color_Pants[3] = 0.0f;
3636         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3637         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3638         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3639         Color_Shirt[3] = 0.0f;
3640         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3641         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3642         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3643         {
3644                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3645                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3646         }
3647         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3648         {
3649                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3650         }
3651         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3652         {
3653                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3654                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3655                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3656                 Color_Diffuse[3] = 0.0f;
3657                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3658                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3659                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3660                 LightColor[3] = 0.0f;
3661                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3662                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3663                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3664                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3665                 Color_Specular[3] = 0.0f;
3666                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3667                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3668                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3669
3670                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3671                 {
3672                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3673                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3674                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3675                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3676                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3677                 }
3678                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3679                 {
3680                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3681                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3682                 }
3683                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3684                 {
3685                         // nothing of this needed
3686                 }
3687                 else
3688                 {
3689                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3690                 }
3691
3692                 for (x = startx;x < endx;x++)
3693                 {
3694                         z = buffer_z[x];
3695                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3696                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3697                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3698                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3699                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3700                         {
3701                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3702                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3703                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3704                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3705                         }
3706                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3707                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3708                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3709                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3710                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3711                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3712                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3713                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3714
3715                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3716                         {
3717                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3718                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3719                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3720                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3721
3722                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3723                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3724                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3725                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3726
3727                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3728                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3729                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3730                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3731
3732                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3733                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3734                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3735                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3736
3737                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3738                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3739
3740                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3741                                 {
3742                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3743                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3744                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3745                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3746                                 }
3747                         }
3748                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3749                         {
3750                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3751                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3752                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3753                                 {
3754                                         float f = 1.0f / 256.0f;
3755                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3756                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3757                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3758                                 }
3759                         }
3760                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3761                         {
3762                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3763                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3764                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3765                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3766
3767                                 LightColor[0] = 1.0;
3768                                 LightColor[1] = 1.0;
3769                                 LightColor[2] = 1.0;
3770                         }
3771                         else
3772                         {
3773                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3774                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3775                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3776                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3777                         }
3778
3779                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3780
3781                         if(thread->shader_exactspecularmath)
3782                         {
3783                                 // reflect lightnormal at surfacenormal, take the negative of that
3784                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3785                                 float f;
3786                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3787                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3788                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3789                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3790
3791                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3792                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3793                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3794                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3795                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3796
3797                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3798                         }
3799                         else
3800                         {
3801                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3802                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3803                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3804                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3805
3806                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3807                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3808                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3809                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3810
3811                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3812                         }
3813
3814                         specular = pow(specular, SpecularPower * glosstex[3]);
3815                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3816                         {
3817                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3818                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3819                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3820                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3821                         }
3822                         else
3823                         {
3824                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3825                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3826                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3827                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3828                         }
3829
3830                         buffer_FragColorbgra8[x*4+0] = d[0];
3831                         buffer_FragColorbgra8[x*4+1] = d[1];
3832                         buffer_FragColorbgra8[x*4+2] = d[2];
3833                         buffer_FragColorbgra8[x*4+3] = d[3];
3834                 }
3835         }
3836         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3837         {
3838                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3839                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3840                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3841                 Color_Diffuse[3] = 0.0f;
3842                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3843                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3844                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3845                 LightColor[3] = 0.0f;
3846                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3847
3848                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3849                 {
3850                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3851                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3852                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3853                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3854                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3855                 }
3856                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3857                 {
3858                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3859                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3860                 }
3861                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3862                 {
3863                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3864                 }
3865                 else
3866                 {
3867                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3868                 }
3869
3870                 for (x = startx;x < endx;x++)
3871                 {
3872                         z = buffer_z[x];
3873                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3874                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3875                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3876                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3877                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3878                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3879                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3880                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3881
3882                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3883                         {
3884                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3885                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3886                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3887                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3888
3889                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3890                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3891                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3892                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3893
3894                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3895                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3896                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3897                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3898
3899                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3900                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3901                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3902                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3903
3904                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3905                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3906
3907                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3908                                 {
3909                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3910                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3911                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3912                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3913                                 }
3914                         }
3915                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3916                         {
3917                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3918                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3919                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3920                                 {
3921                                         float f = 1.0f / 256.0f;
3922                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3923                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3924                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3925                                 }
3926                         }
3927                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3928                         {
3929                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3930                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3931                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3932                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3933
3934                                 LightColor[0] = 1.0;
3935                                 LightColor[1] = 1.0;
3936                                 LightColor[2] = 1.0;
3937                         }
3938                         else
3939                         {
3940                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3941                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3942                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3943                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3944                         }
3945
3946                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3947                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3948                         {
3949                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3950                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3951                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3952                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3953                         }
3954                         else
3955                         {
3956                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3957                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3958                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3959                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3960                         }
3961                         buffer_FragColorbgra8[x*4+0] = d[0];
3962                         buffer_FragColorbgra8[x*4+1] = d[1];
3963                         buffer_FragColorbgra8[x*4+2] = d[2];
3964                         buffer_FragColorbgra8[x*4+3] = d[3];
3965                 }
3966         }
3967         else
3968         {
3969                 for (x = startx;x < endx;x++)
3970                 {
3971                         z = buffer_z[x];
3972                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3973                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3974                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3975                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3976
3977                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3978                         {
3979                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3980                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3981                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3982                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3983                         }
3984                         else
3985                         {
3986                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3987                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3988                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3989                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3990                         }
3991                         buffer_FragColorbgra8[x*4+0] = d[0];
3992                         buffer_FragColorbgra8[x*4+1] = d[1];
3993                         buffer_FragColorbgra8[x*4+2] = d[2];
3994                         buffer_FragColorbgra8[x*4+3] = d[3];
3995                 }
3996         }
3997         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3998 }
3999
4000
4001
4002 void DPSOFTRAST_VertexShader_LightSource(void)
4003 {
4004         int i;
4005         int numvertices = dpsoftrast.numvertices;
4006         float LightPosition[4];
4007         float LightVector[4];
4008         float LightVectorModelSpace[4];
4009         float EyePosition[4];
4010         float EyeVectorModelSpace[4];
4011         float EyeVector[4];
4012         float position[4];
4013         float svector[4];
4014         float tvector[4];
4015         float normal[4];
4016         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4017         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4018         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4019         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4020         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4021         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4022         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4023         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4024         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4025         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4026         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4027         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4028         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4029         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4030         for (i = 0;i < numvertices;i++)
4031         {
4032                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4033                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4034                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4035                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4036                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4037                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4038                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4039                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4040                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4041                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4042                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4043                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4044                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4045                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4046                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4047                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4048                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4049                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4050                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4051                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4052                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4053                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4054                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4055                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4056                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4057                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4058                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4059                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4060                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4061                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4062                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4063                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4064         }
4065         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4066         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4067 }
4068
4069 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4070 {
4071 #ifdef SSE_POSSIBLE
4072         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4073         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4074         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4075         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4076         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4077         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4078         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4079         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4080         int x, startx = span->startx, endx = span->endx;
4081         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4082         float CubeVectordata[4];
4083         float CubeVectorslope[4];
4084         float LightVectordata[4];
4085         float LightVectorslope[4];
4086         float EyeVectordata[4];
4087         float EyeVectorslope[4];
4088         float z;
4089         float diffusetex[4];
4090         float glosstex[4];
4091         float surfacenormal[4];
4092         float lightnormal[4];
4093         float eyenormal[4];
4094         float specularnormal[4];
4095         float diffuse;
4096         float specular;
4097         float SpecularPower;
4098         float CubeVector[4];
4099         float attenuation;
4100         int d[4];
4101         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4102         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4103         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4104         Color_Glow[3] = 0.0f;
4105         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4106         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4107         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4108         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4109         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4110         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4111         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4112         Color_Diffuse[3] = 0.0f;
4113         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4114         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4115         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4116         Color_Specular[3] = 0.0f;
4117         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4118         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4119         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4120         Color_Pants[3] = 0.0f;
4121         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4122         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4123         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4124         Color_Shirt[3] = 0.0f;
4125         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4126         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4127         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4128         LightColor[3] = 0.0f;
4129         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4130         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4131         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4132         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4133         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4134         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4135         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4136         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4137         {
4138                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4139                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4140         }
4141         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4142                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4143         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4144         {
4145                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4146                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4147                 for (x = startx;x < endx;x++)
4148                 {
4149                         z = buffer_z[x];
4150                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4151                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4152                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4153                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4154                         if (attenuation < 0.01f)
4155                                 continue;
4156                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4157                         {
4158                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4159                                 if (attenuation < 0.01f)
4160                                         continue;
4161                         }
4162
4163                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4164                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4165                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4166                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4167                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4168                         {
4169                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4170                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4171                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4172                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4173                         }
4174                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4175                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4176                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4177                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4178                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4179                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4180                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4181                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4182
4183                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4184                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4185                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4186                         DPSOFTRAST_Vector3Normalize(lightnormal);
4187
4188                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4189
4190                         if(thread->shader_exactspecularmath)
4191                         {
4192                                 // reflect lightnormal at surfacenormal, take the negative of that
4193                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4194                                 float f;
4195                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4196                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4197                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4198                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4199
4200                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4201                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4202                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4203                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4204                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4205
4206                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4207                         }
4208                         else
4209                         {
4210                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4211                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4212                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4213                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4214
4215                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4216                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4217                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4218                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4219
4220                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4221                         }
4222                         specular = pow(specular, SpecularPower * glosstex[3]);
4223
4224                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4225                         {
4226                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4227                                 attenuation *= (1.0f / 255.0f);
4228                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4229                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4230                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4231                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4232                         }
4233                         else
4234                         {
4235                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4236                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4237                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4238                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4239                         }
4240                         buffer_FragColorbgra8[x*4+0] = d[0];
4241                         buffer_FragColorbgra8[x*4+1] = d[1];
4242                         buffer_FragColorbgra8[x*4+2] = d[2];
4243                         buffer_FragColorbgra8[x*4+3] = d[3];
4244                 }
4245         }
4246         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4247         {
4248                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4249                 for (x = startx;x < endx;x++)
4250                 {
4251                         z = buffer_z[x];
4252                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4253                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4254                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4255                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4256                         if (attenuation < 0.01f)
4257                                 continue;
4258                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4259                         {
4260                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4261                                 if (attenuation < 0.01f)
4262                                         continue;
4263                         }
4264
4265                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4266                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4267                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4268                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4269                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4270                         {
4271                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4272                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4273                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4274                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4275                         }
4276                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4277                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4278                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4279                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4280
4281                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4282                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4283                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4284                         DPSOFTRAST_Vector3Normalize(lightnormal);
4285
4286                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4287                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4288                         {
4289                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4290                                 attenuation *= (1.0f / 255.0f);
4291                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4292                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4293                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4294                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4295                         }
4296                         else
4297                         {
4298                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4299                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4300                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4301                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4302                         }
4303                         buffer_FragColorbgra8[x*4+0] = d[0];
4304                         buffer_FragColorbgra8[x*4+1] = d[1];
4305                         buffer_FragColorbgra8[x*4+2] = d[2];
4306                         buffer_FragColorbgra8[x*4+3] = d[3];
4307                 }
4308         }
4309         else
4310         {
4311                 for (x = startx;x < endx;x++)
4312                 {
4313                         z = buffer_z[x];
4314                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4315                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4316                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4317                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4318                         if (attenuation < 0.01f)
4319                                 continue;
4320                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4321                         {
4322                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4323                                 if (attenuation < 0.01f)
4324                                         continue;
4325                         }
4326
4327                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4328                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4329                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4330                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4331                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4332                         {
4333                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4334                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4335                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4336                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4337                         }
4338                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4339                         {
4340                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4341                                 attenuation *= (1.0f / 255.0f);
4342                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4343                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4344                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4345                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4346                         }
4347                         else
4348                         {
4349                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4350                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4351                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4352                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4353                         }
4354                         buffer_FragColorbgra8[x*4+0] = d[0];
4355                         buffer_FragColorbgra8[x*4+1] = d[1];
4356                         buffer_FragColorbgra8[x*4+2] = d[2];
4357                         buffer_FragColorbgra8[x*4+3] = d[3];
4358                 }
4359         }
4360         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4361 #endif
4362 }
4363
4364
4365
4366 void DPSOFTRAST_VertexShader_Refraction(void)
4367 {
4368         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4369         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4370         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4371 }
4372
4373 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4374 {
4375         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4376
4377         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4378         float z;
4379         int x, startx = span->startx, endx = span->endx;
4380
4381         // texture reads
4382         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4383         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4384
4385         // varyings
4386         float ModelViewProjectionPositiondata[4];
4387         float ModelViewProjectionPositionslope[4];
4388
4389         // uniforms
4390         float ScreenScaleRefractReflect[2];
4391         float ScreenCenterRefractReflect[2];
4392         float DistortScaleRefractReflect[2];
4393         float RefractColor[4];
4394
4395         const unsigned char * RESTRICT pixelbase;
4396         const unsigned char * RESTRICT pixel[4];
4397         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4398         if(!texture) return;
4399         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4400
4401         // read textures
4402         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4403         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4404
4405         // read varyings
4406         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4407
4408         // read uniforms
4409         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4410         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4411         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4412         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4413         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4414         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4415         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4416         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4417         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4418         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4419
4420         // do stuff
4421         for (x = startx;x < endx;x++)
4422         {
4423                 float SafeScreenTexCoord[2];
4424                 float ScreenTexCoord[2];
4425                 float v[3];
4426                 float iw;
4427                 unsigned char c[4];
4428
4429                 z = buffer_z[x];
4430
4431                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4432                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4433                 
4434                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4435                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4436                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4437
4438                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4439                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4440                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4441                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4442                 DPSOFTRAST_Vector3Normalize(v);
4443                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4444                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4445
4446                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4447                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4448                 {
4449                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4450                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4451                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4452                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4453                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4454                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4455                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4456                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4457                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4458                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4459                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4460                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4461                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4462                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4463                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4464                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4465                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4466                 }
4467                 else
4468                 {
4469                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4470                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4471                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4472                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4473                         c[0] = pixel[0][0];
4474                         c[1] = pixel[0][1];
4475                         c[2] = pixel[0][2];
4476                 }
4477
4478                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4479                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4480                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4481                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4482                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4483         }
4484
4485         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4486 }
4487
4488
4489
4490 void DPSOFTRAST_VertexShader_Water(void)
4491 {
4492         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4493 }
4494
4495
4496 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4497 {
4498         // TODO: IMPLEMENT
4499         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4500         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4501         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4502         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4503         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4504 }
4505
4506
4507
4508 void DPSOFTRAST_VertexShader_ShowDepth(void)
4509 {
4510         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4511 }
4512
4513 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4514 {
4515         // TODO: IMPLEMENT
4516         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4517         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4518         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4519         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4520         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4521 }
4522
4523
4524
4525 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4526 {
4527         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4528 }
4529
4530 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4531 {
4532         // TODO: IMPLEMENT
4533         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4534         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4535         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4536         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4537         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4538 }
4539
4540
4541
4542 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4543 {
4544         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4545 }
4546
4547 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4548 {
4549         // TODO: IMPLEMENT
4550         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4551         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4552         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4553         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4554         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4555 }
4556
4557
4558
// One row of DPSOFTRAST_ShaderModeTable: the vertex and pixel shader
// implementing a shader mode plus the vertex arrays and texture units it uses.
typedef struct DPSOFTRAST_ShaderModeInfo_s
{
	// every row of DPSOFTRAST_ShaderModeTable sets this to 2
	// NOTE(review): presumably the vertex array used for texture LOD selection - confirm against the code that reads it
	int lodarrayindex;
	// vertex shader, takes no arguments
	void (*Vertex)(void);
	// pixel shader, called once per span by DPSOFTRAST_Draw_ProcessSpans
	void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
	// DPSOFTRAST_ARRAY_* indices used by the shader; the table rows end the list with ~0 (stored as 0xFF)
	unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
	// GL20TU_* texture units used by the shader; the table rows end the list with ~0 (stored as 0xFF)
	unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
}
DPSOFTRAST_ShaderModeInfo;
4568
4569 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4570 {
4571         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4572         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4573         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4574         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4575         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4576         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4577         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4578         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4579         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4580         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4581         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4582         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4583         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4584         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4585         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4586         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4587 };
4588
4589 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4590 {
4591         int x;
4592         int startx;
4593         int endx;
4594         unsigned int *depthpixel;
4595         int depth;
4596         int depthslope;
4597         unsigned int d;
4598         unsigned char *pixelmask;
4599         DPSOFTRAST_State_Triangle *triangle;
4600         triangle = &thread->triangles[span->triangle];
4601         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4602         startx = span->startx;
4603         endx = span->endx;
4604         depth = span->depthbase;
4605         depthslope = span->depthslope;
4606         pixelmask = thread->pixelmaskarray;
4607         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4608         {
4609                 switch(thread->fb_depthfunc)
4610                 {
4611                 default:
4612                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4613                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4614                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4615                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4616                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4617                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4618                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4619                 }
4620                 while (startx < endx && !pixelmask[startx])
4621                         startx++;
4622                 while (endx > startx && !pixelmask[endx-1])
4623                         endx--;
4624         }
4625         else
4626         {
4627                 // no depth testing means we're just dealing with color...
4628                 memset(pixelmask + startx, 1, endx - startx);
4629         }
4630         span->pixelmask = pixelmask;
4631         span->startx = startx;
4632         span->endx = endx;
4633 }
4634
4635 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4636 {
4637         int x, d, depth, depthslope, startx, endx;
4638         const unsigned char *pixelmask;
4639         unsigned int *depthpixel;
4640         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4641         {
4642                 depth = span->depthbase;
4643                 depthslope = span->depthslope;
4644                 pixelmask = span->pixelmask;
4645                 startx = span->startx;
4646                 endx = span->endx;
4647                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4648                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4649                         if (pixelmask[x])
4650                                 depthpixel[x] = d;
4651         }
4652 }
4653
4654 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4655 {
4656         int i;
4657         DPSOFTRAST_State_Triangle *triangle;
4658         DPSOFTRAST_State_Span *span;
4659         for (i = 0; i < thread->numspans; i++)
4660         {
4661                 span = &thread->spans[i];
4662                 triangle = &thread->triangles[span->triangle];
4663                 DPSOFTRAST_Draw_DepthTest(thread, span);
4664                 if (span->startx >= span->endx)
4665                         continue;
4666                 // run pixel shader if appropriate
4667                 // do this before running depthmask code, to allow the pixelshader
4668                 // to clear pixelmask values for alpha testing
4669                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4670                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4671                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4672         }
4673         thread->numspans = 0;
4674 }
4675
// Draw command (id 22), handled by DPSOFTRAST_Interpret_Draw as a DPSOFTRAST_Command_Draw.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it declares that struct with the fields listed here - confirm.
// refcount is decremented atomically by the interpreter; when it reaches zero and the
// command is no larger than the aligned struct itself, arrays is released with MM_FREE.
DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4677
// Executes one Draw command on the calling rasterizer thread.
//
// For every triangle of the command this function:
//   - fetches the three vertex indices (16-bit list, 32-bit list, or implicit i*3+n),
//   - when command->clipped is set, clips the triangle against the plane z + w >= 0,
//     producing a 3 or 4 point polygon in screen[],
//   - skips it according to thread->cullface (SKIPBACKFACE),
//   - clamps its bounding box to the scissor rectangle and to this thread's rows,
//   - computes x/y gradients for w and for every vertex array used by the current
//     shader mode, and picks a mip level per texture unit,
//   - walks the polygon edges row by row and appends spans to thread->spans,
//     flushing through DPSOFTRAST_Draw_ProcessSpans whenever the span or
//     triangle buffers fill up.
//
// Only rows inside the thread's two bands [miny1,maxy1) and [miny2,maxy2), clamped
// to the scissor rectangle, are rasterized.
//
// Every thread decrements command->refcount exactly once; the thread that brings it
// to zero releases command->arrays with MM_FREE when the command carries no inline
// data (commandsize no larger than the bare command struct).
//
// The whole body is compiled out unless SSE_POSSIBLE is defined.
4678 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4679 {
4680 #ifdef SSE_POSSIBLE
4681         int cullface = thread->cullface;
4682         int minx, maxx, miny, maxy;
4683         int miny1, maxy1, miny2, maxy2;
4684         __m128i fbmin, fbmax;
4685         __m128 viewportcenter, viewportscale;
4686         int firstvertex = command->firstvertex;
4687         int numvertices = command->numvertices;
4688         int numtriangles = command->numtriangles;
4689         const int *element3i = command->element3i;
4690         const unsigned short *element3s = command->element3s;
4691         int clipped = command->clipped;
4692         int i;
4693         int j;
4694         int k;
4695         int y;
4696         int e[3];
4697         __m128i screeny;
4698         int starty, endy, bandy;
4699         int numpoints;
4700         int clipcase;
4701         float clipdist[4];
4702         float clip0origin, clip0slope;
4703         int clip0dir;
4704         __m128 triangleedge1, triangleedge2, trianglenormal;
4705         __m128 clipfrac[3];
4706         __m128 screen[4];
4707         DPSOFTRAST_State_Triangle *triangle;
4708         DPSOFTRAST_Texture *texture;
4709         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp this thread's two scanline bands to the scissor rectangle
4710         miny = thread->fb_scissor[1];
4711         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4712         miny1 = bound(miny, thread->miny1, maxy);
4713         maxy1 = bound(miny, thread->maxy1, maxy);
4714         miny2 = bound(miny, thread->miny2, maxy);
4715         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's row range touches neither band of this thread:
             // nothing to draw here, just drop our reference
4716         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4717         {
4718                 if (!ATOMIC_DECREMENT(command->refcount))
4719                 {
4720                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4721                                 MM_FREE(command->arrays);
4722                 }
4723                 return;
4724         }
4725         minx = thread->fb_scissor[0];
4726         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // 16-bit (x,y) bounds used to clamp each triangle's bounding box;
             // fbmax is inclusive, hence the subtraction of 1
4727         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4728         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4729         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4730         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4731         screen[3] = _mm_setzero_ps();
4732         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4733         for (i = 0;i < numtriangles;i++)
4734         {
                     // command->arrays starts with the screen coordinates, followed by
                     // one block of numvertices float[4] per vertex array
4735                 const float *screencoord4f = command->arrays;
4736                 const float *arrays = screencoord4f + numvertices*4;
4737
4738                 // generate the 3 edges of this triangle
4739                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4740                 if (element3s)
4741                 {
4742                         e[0] = element3s[i*3+0] - firstvertex;
4743                         e[1] = element3s[i*3+1] - firstvertex;
4744                         e[2] = element3s[i*3+2] - firstvertex;
4745                 }
4746                 else if (element3i)
4747                 {
4748                         e[0] = element3i[i*3+0] - firstvertex;
4749                         e[1] = element3i[i*3+1] - firstvertex;
4750                         e[2] = element3i[i*3+2] - firstvertex;
4751                 }
4752                 else
4753                 {
4754                         e[0] = i*3+0;
4755                         e[1] = i*3+1;
4756                         e[2] = i*3+2;
4757                 }
4758
                     // SKIPBACKFACE: builds the two edges meeting at screen[1], takes the scalar
                     // component of their cross product, and skips the triangle (continue)
                     // depending on its sign and on cullface
4759 #define SKIPBACKFACE \
4760                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4761                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4762                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4763                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4764                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4765                 switch(cullface) \
4766                 { \
4767                 case GL_BACK: \
4768                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4769                                 continue; \
4770                         break; \
4771                 case GL_FRONT: \
4772                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4773                                 continue; \
4774                         break; \
4775                 }
4776
                     // CLIPPEDVERTEXLERP: screen[k] = projection of the point on edge p1->p2 where
                     // the clip distance reaches zero; the lerp fraction is kept in clipfrac[p1]
                     // so GENATTRIBLERP can interpolate the other arrays the same way
                     // CLIPPEDVERTEXCOPY: screen[k] = already projected vertex p1
4777 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4778                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4779                         { \
4780                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4781                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4782                         }
4783 #define CLIPPEDVERTEXCOPY(k,p1) \
4784                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4785
                     // GENATTRIBS: fetches the attribute values for screen[0], screen[1] and
                     // screen[2] as chosen by the current clipcase (copy or lerp per point)
4786 #define GENATTRIBCOPY(attrib, p1) \
4787                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4788 #define GENATTRIBLERP(attrib, p1, p2) \
4789                 { \
4790                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4791                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4792                 }
4793 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4794                 switch(clipcase) \
4795                 { \
4796                 default: \
4797                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4798                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4799                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4800                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4801                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4802                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4803                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4804                 }
4805
4806                 if (! clipped)
4807                         goto notclipped;
4808
4809                 // calculate distance from nearplane
4810                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4811                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4812                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // one branch per combination of vertices in front of the plane;
                     // clipcase records the combination for GENATTRIBS
4813                 if (clipdist[0] >= 0.0f)
4814                 {
4815                         if (clipdist[1] >= 0.0f)
4816                         {
4817                                 if (clipdist[2] >= 0.0f)
4818                                 {
4819                                 notclipped:
4820                                         // triangle is entirely in front of nearplane
4821                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4822                                         SKIPBACKFACE;
4823                                         numpoints = 3;
4824                                         clipcase = 0;
4825                                 }
4826                                 else
4827                                 {
4828                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4829                                         SKIPBACKFACE;
4830                                         numpoints = 4;
4831                                         clipcase = 1;
4832                                 }
4833                         }
4834                         else
4835                         {
4836                                 if (clipdist[2] >= 0.0f)
4837                                 {
4838                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4839                                         SKIPBACKFACE;
4840                                         numpoints = 4;
4841                                         clipcase = 2;
4842                                 }
4843                                 else
4844                                 {
4845                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4846                                         SKIPBACKFACE;
4847                                         numpoints = 3;
4848                                         clipcase = 3;
4849                                 }
4850                         }
4851                 }
4852                 else if (clipdist[1] >= 0.0f)
4853                 {
4854                         if (clipdist[2] >= 0.0f)
4855                         {
4856                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4857                                 SKIPBACKFACE;
4858                                 numpoints = 4;
4859                                 clipcase = 4;
4860                         }
4861                         else
4862                         {
4863                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4864                                 SKIPBACKFACE;
4865                                 numpoints = 3;
4866                                 clipcase = 5;
4867                         }
4868                 }
4869                 else if (clipdist[2] >= 0.0f)
4870                 {
4871                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4872                         SKIPBACKFACE;
4873                         numpoints = 3;
4874                         clipcase = 6;
4875                 }
4876                 else continue; // triangle is entirely behind nearplane
4877
4878                 {
4879                         // calculate integer y coords for triangle points
                             // (for a 3 point polygon screen[2] is packed twice so the fourth
                             // lane never changes the min/max)
4880                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4881                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4882                                         screenmin = _mm_min_epi16(screeni, screenir),
4883                                         screenmax = _mm_max_epi16(screeni, screenir);
4884                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4885                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4886                         screenmin = _mm_max_epi16(screenmin, fbmin);
4887                         screenmax = _mm_min_epi16(screenmax, fbmax);
4888                         // skip offscreen triangles
4889                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4890                                 continue;
4891                         starty = _mm_extract_epi16(screenmin, 1);
4892                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // rows lie entirely in the gap between this thread's two bands
4893                         if (starty >= maxy1 && endy <= miny2)
4894                                 continue;
4895                         screeny = _mm_srai_epi32(screeni, 16);
4896                 }
4897
4898                 triangle = &thread->triangles[thread->numtriangles];
4899
4900                 // calculate attribute plans for triangle data...
4901                 // okay, this triangle is going to produce spans, we'd better project
4902                 // the interpolants now (this is what gives perspective texturing),
4903                 // this consists of simply multiplying all arrays by the W coord
4904                 // (which is basically 1/Z), which will be undone per-pixel
4905                 // (multiplying by Z again) to get the perspective-correct array
4906                 // values
4907                 {
4908                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4909                         __m128 mipedgescale, mipdensity;
4910                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4911                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4912                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4913                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4914                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4915                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4916                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4917                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4918                         attribedge1 = _mm_sub_ss(w0, w1);
4919                         attribedge2 = _mm_sub_ss(w2, w1);
4920                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4921                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4922                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4923                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4924                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
                             // triangle->w = { dw/dx, dw/dy, w at screen origin }
4925                         _mm_store_ss(&triangle->w[0], attribxslope);
4926                         _mm_store_ss(&triangle->w[1], attribyslope);
4927                         _mm_store_ss(&triangle->w[2], attriborigin);
4928                         
                             // optional clip plane (thread->fb_clipplane): express the plane value as
                             // cliporigin + clipxslope*x + clipyslope*y over the triangle and turn it
                             // into a per-row x cutoff (clip0origin + row*clip0slope); clip0dir selects
                             // which side of the cutoff is kept, 0 disables the test
4929                         clip0origin = 0;
4930                         clip0slope = 0;
4931                         clip0dir = 0;
4932                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4933                         {
4934                                 float cliporigin, clipxslope, clipyslope;
4935                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4936                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4937                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4938                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4939                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4940                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4941                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4942                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4943                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4944                                 if(clipxslope != 0)
4945                                 {
4946                                         clip0origin = -cliporigin/clipxslope;
4947                                         clip0slope = -clipyslope/clipxslope;
4948                                         clip0dir = clipxslope > 0 ? 1 : -1;
4949                                 }
4950                                 else if(clipyslope > 0)
4951                                 {
4952                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4953                                         clip0slope = dpsoftrast.fb_width;
4954                                         clip0dir = -1;
4955                                 }
4956                                 else if(clipyslope < 0)
4957                                 {
4958                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4959                                         clip0slope = -dpsoftrast.fb_width;
4960                                         clip0dir = -1;
4961                                 }
                                     // both slopes are zero, so the plane value is the constant cliporigin
                                     // over the whole triangle: drop the triangle when it is negative
                                     // (this used to test clip0origin, which is always 0 on this path)
4962                                 else if(cliporigin < 0) continue;
4963                         }
4964
                             // x/y gradients and origin for each vertex array the shader mode uses,
                             // pre-multiplied by w as described above
4965                         mipedgescale = _mm_setzero_ps();
4966                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4967                         {
4968                                 __m128 attrib0, attrib1, attrib2;
4969                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4970                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4971                                         break;
4972                                 arrays += numvertices*4;
4973                                 GENATTRIBS(attrib0, attrib1, attrib2);
4974                                 attriborigin = _mm_mul_ps(attrib1, w1);
4975                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4976                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4977                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4978                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4979                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4980                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4981                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4982                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                     // the array named by lodarrayindex supplies the attribute change
                                     // per unit of screen edge length used for mip selection below
4983                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4984                                 {
4985                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4986                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4987                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4988                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4989                                 }
4990                         }
4991
                             // choose one mip level per texture unit for the whole triangle;
                             // units left untouched keep level 0 from the memset
4992                         memset(triangle->mip, 0, sizeof(triangle->mip));
4993                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4994                         {
4995                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4996                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4997                                         break;
4998                                 texture = thread->texbound[texunit];
4999                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5000                                 {
5001                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5002                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5003                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5004                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5005                                         // this will be multiplied in the texturing routine by the texture resolution
5006                                         y = _mm_cvtss_si32(mipdensity);
5007                                         if (y > 0)
5008                                         {
                                                     // mipdensity is a squared length, hence the factor 0.5 on the log2
5009                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5010                                                 if (y > texture->mipmaps - 1)
5011                                                         y = texture->mipmaps - 1;
5012                                                 triangle->mip[texunit] = y;
5013                                         }
5014                                 }
5015                         }
5016                 }
5017         
                     // rasterize the rows that fall in the first band, then jump to the second
                     // band (y = max(y, miny2)) and rasterize the rows that fall there
5018                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5019                 for (; y < bandy;)
5020                 {
5021                         __m128 xcoords, xslope;
                             // yccmask has one nibble per polygon point (point 0 lowest), all ones
                             // when y is greater than that point's integer y; the switch below maps
                             // the pattern to the two active edges (edgeNp -> edgeNn)
5022                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5023                         int yccmask = _mm_movemask_epi8(ycc);
5024                         int edge0p, edge0n, edge1p, edge1n;
5025                         int nexty;
5026                         float w, wslope;
5027                         float clip0;
5028                         if (numpoints == 4)
5029                         {
5030                                 switch(yccmask)
5031                                 {
5032                                 default:
5033                                 case 0xFFFF: /*0000*/ y = endy; continue;
5034                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5035                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5036                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5037                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5038                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5039                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5040                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5041                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5042                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5043                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5044                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5045                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5046                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5047                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5048                                 case 0x0000: /*1111*/ y++; continue;
5049                                 }
5050                         }
5051                         else
5052                         {
5053                                 switch(yccmask)
5054                                 {
5055                                 default:
5056                                 case 0xFFFF: /*000*/ y = endy; continue;
5057                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5058                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5059                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5060                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5061                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5062                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5063                                 case 0x0000: /*111*/ y++; continue;
5064                                 }
5065                         }
                             // nexty = last row that can reuse this edge pair, limited to the current band
5066                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5067                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5068                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5069                         nexty = _mm_extract_epi16(ycc, 0);
5070                         if (nexty >= bandy) nexty = bandy-1;
                             // dx/dy and x at row y for both edges (edge0 in the low half, edge1 in
                             // the high half); the halves are swapped below if edge0 is to the right
5071                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5072                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5073                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5074                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5075                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5076                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5077                         {
5078                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5079                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5080                         }
                             // x cutoff of the clip plane on row y (only consulted when clip0dir != 0)
5081                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5082                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5083                         {
5084                                 int startx, endx, offset;
5085                                 startx = _mm_cvtss_si32(xcoords);
5086                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5087                                 if (startx < minx) startx = minx;
5088                                 if (endx > maxx) endx = maxx;
5089                                 if (startx >= endx) continue;
5090
5091                                 if (clip0dir)
5092                                 {
5093                                         if (clip0dir > 0)
5094                                         {
5095                                                 if (startx < clip0) 
5096                                                 {
5097                                                         if(endx <= clip0) continue;
5098                                                         startx = (int)clip0;
5099                                                 }
5100                                         }
5101                                         else if (endx > clip0) 
5102                                         {
5103                                                 if(startx >= clip0) continue;
5104                                                 endx = (int)clip0;
5105                                         }
5106                                 }
5107                                                 
                                     // split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5108                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5109                                 {
5110                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5111                                         span->triangle = thread->numtriangles;
5112                                         span->x = offset;
5113                                         span->y = y;
5114                                         span->startx = 0;
5115                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5116                                         if (span->startx >= span->endx)
5117                                                 continue;
5118                                         wslope = triangle->w[0];
5119                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5120                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5121                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5122                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5123                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5124                                 }
5125                         }
5126                 }
5127
                     // the triangle slot is now referenced by queued spans; flush before the
                     // triangle buffer wraps around
5128                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5129                 {
5130                         DPSOFTRAST_Draw_ProcessSpans(thread);
5131                         thread->numtriangles = 0;
5132                 }
5133         }
5134
             // this thread no longer reads the command's vertex data; the last thread to
             // get here frees arrays that were allocated separately from the command
5135         if (!ATOMIC_DECREMENT(command->refcount))
5136         {
5137                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5138                         MM_FREE(command->arrays);
5139         }
5140
             // draw whatever spans are still queued for this command
5141         if (thread->numspans > 0 || thread->numtriangles > 0)
5142         {
5143                 DPSOFTRAST_Draw_ProcessSpans(thread);
5144                 thread->numtriangles = 0;
5145         }
5146 #endif
5147 }
5148
// Allocates a Draw command together with storage for the processed vertex data
// and a private copy of the caller's element (index) list.
//
// Data area layout, in order:
//   1. screen coordinates           (numvertices float[4], dpsoftrast.screencoord4f)
//   2. post-transform positions     (post_array4f[DPSOFTRAST_ARRAY_POSITION])
//   3. one numvertices float[4] block per entry of the current shader mode's
//      arrays[] list, stopping at the first entry >= DPSOFTRAST_ARRAY_TOTAL
//   4. the copied indices (16-bit if element3s is given, else 32-bit if element3i is given)
//
// If command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, only the bare
// command goes through DPSOFTRAST_AllocateCommand and the data comes from MM_CALLOC;
// otherwise the data is placed directly behind the command.
//
// Side effects: resets dpsoftrast.post_array4f[] and points it and
// dpsoftrast.screencoord4f at the new storage; records firstvertex and
// numvertices in dpsoftrast.
//
// The returned command has firstvertex, numvertices, numtriangles, arrays,
// element3i and element3s filled in; the other fields are not assigned here.
// NOTE(review): the MM_CALLOC result is used without a NULL check.
5149 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5150 {
5151         int i;
5152         int j;
5153         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
             // screen coordinates + post-transform positions are always present
5154         int datasize = 2*numvertices*sizeof(float[4]);
5155         DPSOFTRAST_Command_Draw *command;
5156         unsigned char *data;
             // one more float[4] block per array listed for the current shader mode
5157         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5158         {
5159                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5160                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5161                         break;
5162                 datasize += numvertices*sizeof(float[4]);
5163         }
5164         if (element3s)
5165                 datasize += numtriangles*sizeof(unsigned short[3]);
5166         else if (element3i)
5167                 datasize += numtriangles*sizeof(int[3]);
5168         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5169         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5170         {
                     // too large to store inline: keep the data in a separate allocation
                     // (DPSOFTRAST_Interpret_Draw releases it with MM_FREE)
5171                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5172                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5173         }
5174         else
5175         {
                     // data lives directly behind the command struct
5176                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5177                 data = (unsigned char *)command + commandsize;
5178         }
5179         command->firstvertex = firstvertex;
5180         command->numvertices = numvertices;
5181         command->numtriangles = numtriangles;
5182         command->arrays = (float *)data;
             // carve the data area into the per-array output pointers
5183         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5184         dpsoftrast.firstvertex = firstvertex;
5185         dpsoftrast.numvertices = numvertices;
5186         dpsoftrast.screencoord4f = (float *)data;
5187         data += numvertices*sizeof(float[4]);
5188         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5189         data += numvertices*sizeof(float[4]);
5190         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5191         {
5192                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5193                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5194                         break;
5195                 dpsoftrast.post_array4f[j] = (float *)data;
5196                 data += numvertices*sizeof(float[4]);
5197         }
             // copy the indices behind the vertex arrays; the 16-bit list takes
             // precedence when both lists are supplied
5198         command->element3i = NULL;
5199         command->element3s = NULL;
5200         if (element3s)
5201         {
5202                 command->element3s = (unsigned short *)data;
5203                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5204         }
5205         else if (element3i)
5206         {
5207                 command->element3i = (int *)data;
5208                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5209         }
5210         return command;
5211 }
5212
5213 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5214 {
5215         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5216         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5217         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5218         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5219         if (command->starty >= command->endy)
5220         {
5221                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5222                         MM_FREE(command->arrays);
5223                 DPSOFTRAST_UndoCommand(command->commandsize);
5224                 return;
5225         }
5226         command->clipped = dpsoftrast.drawclipped;
5227         command->refcount = dpsoftrast.numthreads;
5228
5229         if (dpsoftrast.usethreads)
5230         {
5231                 int i;
5232                 DPSOFTRAST_Draw_SyncCommands();
5233                 for (i = 0; i < dpsoftrast.numthreads; i++)
5234                 {
5235                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5236                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5237                                 Thread_CondSignal(thread->drawcond);
5238                 }
5239         }
5240         else
5241         {
5242                 DPSOFTRAST_Draw_FlushThreads();
5243         }
5244 }
5245
5246 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5247 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5248 {
5249         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5250 }
5251 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5252 {
5253         DPSOFTRAST_Command_SetRenderTargets *command;
5254         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5255                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5256                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5257                 DPSOFTRAST_Flush();
5258         dpsoftrast.fb_width = width;
5259         dpsoftrast.fb_height = height;
5260         dpsoftrast.fb_depthpixels = depthpixels;
5261         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5262         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5263         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5264         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5265         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5266         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5267         command->width = width;
5268         command->height = height;
5269 }
5270  
// Executes queued commands on behalf of one thread state, starting at
// thread->commandoffset and stopping once the offset equals endoffset.
// The final offset is written back to thread->commandoffset on return.
5271 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5272 {
5273         int commandoffset = thread->commandoffset;
5274         while (commandoffset != endoffset)
5275         {
5276                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
                     // NOTE(review): there is no default case; an opcode not listed below
                     // would leave commandoffset unchanged and the loop would not advance.
5277                 switch (command->opcode)
5278                 {
                 // Fixed-size commands: call the matching DPSOFTRAST_Interpret_<name>,
                 // step over the aligned command struct, and wrap to offset 0 when the
                 // end of the pool is reached.
5279 #define INTERPCOMMAND(name) \
5280                 case DPSOFTRAST_OPCODE_##name : \
5281                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5282                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5283                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5284                                 commandoffset = 0; \
5285                         break;
5286                 INTERPCOMMAND(Viewport)
5287                 INTERPCOMMAND(ClearColor)
5288                 INTERPCOMMAND(ClearDepth)
5289                 INTERPCOMMAND(ColorMask)
5290                 INTERPCOMMAND(DepthTest)
5291                 INTERPCOMMAND(ScissorTest)
5292                 INTERPCOMMAND(Scissor)
5293                 INTERPCOMMAND(BlendFunc)
5294                 INTERPCOMMAND(BlendSubtract)
5295                 INTERPCOMMAND(DepthMask)
5296                 INTERPCOMMAND(DepthFunc)
5297                 INTERPCOMMAND(DepthRange)
5298                 INTERPCOMMAND(PolygonOffset)
5299                 INTERPCOMMAND(CullFace)
5300                 INTERPCOMMAND(AlphaTest)
5301                 INTERPCOMMAND(AlphaFunc)
5302                 INTERPCOMMAND(SetTexture)
5303                 INTERPCOMMAND(SetShader)
5304                 INTERPCOMMAND(Uniform4f)
5305                 INTERPCOMMAND(UniformMatrix4f)
5306                 INTERPCOMMAND(Uniform1i)
5307                 INTERPCOMMAND(SetRenderTargets)
5308                 INTERPCOMMAND(ClipPlane)
5309
                 // Draw commands are variable-sized, so the stored commandsize is used
                 // to advance; the new offset is published to the thread state right
                 // after each draw rather than only at the end of the loop.
5310                 case DPSOFTRAST_OPCODE_Draw:
5311                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5312                         commandoffset += command->commandsize;
5313                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5314                                 commandoffset = 0;
5315                         thread->commandoffset = commandoffset;
5316                         break;
5317
                 // Reset restarts reading from the beginning of the pool.
5318                 case DPSOFTRAST_OPCODE_Reset:
5319                         commandoffset = 0;
5320                         break;
5321                 }
5322         }
5323         thread->commandoffset = commandoffset;
5324 }
5325
// Thread entry point passed to Thread_CreateThread; data is the thread's
// DPSOFTRAST_State_Thread. Runs until thread->index becomes negative, then
// returns 0.
5326 static int DPSOFTRAST_Draw_Thread(void *data)
5327 {
5328         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5329         while(thread->index >= 0)
5330         {
                 // commands are pending whenever this thread's read offset differs
                 // from dpsoftrast.drawcommand
5331                 if (thread->commandoffset != dpsoftrast.drawcommand)
5332                 {
5333                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5334                 }
5335                 else 
5336                 {
5337                         Thread_LockMutex(thread->drawmutex);
                         // re-check under the mutex before going to sleep
5338                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5339                         {
                                 // wake whoever set thread->waiting and is blocked on waitcond
5340                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
                                 // starving is set only for the duration of the wait on drawcond
5341                                 thread->starving = true;
5342                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5343                                 thread->starving = false;
5344                         }
5345                         Thread_UnlockMutex(thread->drawmutex);
5346                 }
5347         }   
5348         return 0;
5349 }
5350
// Calls DPSOFTRAST_Draw_SyncCommands() and then brings every thread state up
// to dpsoftrast.drawcommand: by waking and waiting for the worker threads when
// usethreads is set, or by interpreting the commands on the calling thread
// otherwise. Finally resets commandpool.usedcommands to 0.
5351 static void DPSOFTRAST_Draw_FlushThreads(void)
5352 {
5353         DPSOFTRAST_State_Thread *thread;
5354         int i;
5355         DPSOFTRAST_Draw_SyncCommands();
5356         if (dpsoftrast.usethreads) 
5357         {
                 // pass 1: signal drawcond for threads that are behind and marked starving
5358                 for (i = 0; i < dpsoftrast.numthreads; i++)
5359                 {
5360                         thread = &dpsoftrast.threads[i];
5361                         if (thread->commandoffset != dpsoftrast.drawcommand)
5362                         {
5363                                 Thread_LockMutex(thread->drawmutex);
5364                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5365                                         Thread_CondSignal(thread->drawcond);
5366                                 Thread_UnlockMutex(thread->drawmutex);
5367                         }
5368                 }
                 // pass 2: for each thread still behind, set waiting and block on its waitcond
5369                 for (i = 0; i < dpsoftrast.numthreads; i++)
5370                 {
5371                         thread = &dpsoftrast.threads[i];
5372                         if (thread->commandoffset != dpsoftrast.drawcommand)
5373                         {
5374                                 Thread_LockMutex(thread->drawmutex);
                                 // NOTE(review): the wait is guarded by an if, not a loop, so the
                                 // offset is not re-checked after wake-up -- confirm
                                 // Thread_CondWait cannot return early.
5375                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5376                                 {
5377                                         thread->waiting = true;
5378                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5379                                         thread->waiting = false;
5380                                 }
5381                                 Thread_UnlockMutex(thread->drawmutex);
5382                         }
5383                 }
5384         }
5385         else
5386         {
                 // no worker threads: run each thread state's pending commands right here
5387                 for (i = 0; i < dpsoftrast.numthreads; i++)
5388                 {
5389                         thread = &dpsoftrast.threads[i];
5390                         if (thread->commandoffset != dpsoftrast.drawcommand)
5391                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5392                 }
5393         }
5394         dpsoftrast.commandpool.usedcommands = 0;
5395 }
5396
// Public entry point that forwards to DPSOFTRAST_Draw_FlushThreads().
5397 void DPSOFTRAST_Flush(void)
5398 {
5399         DPSOFTRAST_Draw_FlushThreads();
5400 }
5401
// Currently identical to DPSOFTRAST_Flush(): it simply calls it.
5402 void DPSOFTRAST_Finish(void)
5403 {
5404         DPSOFTRAST_Flush();
5405 }
5406
5407 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5408 {
5409         int i;
5410         union
5411         {
5412                 int i;
5413                 unsigned char b[4];
5414         }
5415         u;
5416         u.i = 1;
5417         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5418         dpsoftrast.bigendian = u.b[3];
5419         dpsoftrast.fb_width = width;
5420         dpsoftrast.fb_height = height;
5421         dpsoftrast.fb_depthpixels = depthpixels;
5422         dpsoftrast.fb_colorpixels[0] = colorpixels;
5423         dpsoftrast.fb_colorpixels[1] = NULL;
5424         dpsoftrast.fb_colorpixels[1] = NULL;
5425         dpsoftrast.fb_colorpixels[1] = NULL;
5426         dpsoftrast.viewport[0] = 0;
5427         dpsoftrast.viewport[1] = 0;
5428         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5429         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5430         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5431         dpsoftrast.texture_firstfree = 1;
5432         dpsoftrast.texture_end = 1;
5433         dpsoftrast.texture_max = 0;
5434         dpsoftrast.color[0] = 1;
5435         dpsoftrast.color[1] = 1;
5436         dpsoftrast.color[2] = 1;
5437         dpsoftrast.color[3] = 1;
5438         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5439         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5440         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5441         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5442         for (i = 0; i < dpsoftrast.numthreads; i++)
5443         {
5444                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5445                 thread->index = i;
5446                 thread->cullface = GL_BACK;
5447         thread->colormask[0] = 1; 
5448                 thread->colormask[1] = 1;
5449                 thread->colormask[2] = 1;
5450                 thread->colormask[3] = 1;
5451                 thread->blendfunc[0] = GL_ONE;
5452                 thread->blendfunc[1] = GL_ZERO;
5453                 thread->depthmask = true;
5454                 thread->depthtest = true;
5455                 thread->depthfunc = GL_LEQUAL;
5456                 thread->scissortest = false;
5457                 thread->alphatest = false;
5458                 thread->alphafunc = GL_GREATER;
5459                 thread->alphavalue = 0.5f;
5460                 thread->viewport[0] = 0;
5461                 thread->viewport[1] = 0;
5462                 thread->viewport[2] = dpsoftrast.fb_width;
5463                 thread->viewport[3] = dpsoftrast.fb_height;
5464                 thread->scissor[0] = 0;
5465                 thread->scissor[1] = 0;
5466                 thread->scissor[2] = dpsoftrast.fb_width;
5467                 thread->scissor[3] = dpsoftrast.fb_height;
5468                 thread->depthrange[0] = 0;
5469                 thread->depthrange[1] = 1;
5470                 thread->polygonoffset[0] = 0;
5471                 thread->polygonoffset[1] = 0;
5472                 thread->clipplane[0] = 0;
5473                 thread->clipplane[1] = 0;
5474                 thread->clipplane[2] = 0;
5475                 thread->clipplane[3] = 1;
5476         
5477                 thread->numspans = 0;
5478                 thread->numtriangles = 0;
5479                 thread->commandoffset = 0;
5480                 thread->waiting = false;
5481                 thread->starving = false;
5482            
5483                 thread->validate = -1;
5484                 DPSOFTRAST_Validate(thread, -1);
5485  
5486                 if (dpsoftrast.usethreads)
5487                 {
5488                         thread->waitcond = Thread_CreateCond();
5489                         thread->drawcond = Thread_CreateCond();
5490                         thread->drawmutex = Thread_CreateMutex();
5491                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5492                 }
5493         }
5494         return 0;
5495 }
5496
5497 void DPSOFTRAST_Shutdown(void)
5498 {
5499         int i;
5500         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5501         {
5502                 DPSOFTRAST_State_Thread *thread;
5503                 for (i = 0; i < dpsoftrast.numthreads; i++)
5504                 {
5505                         thread = &dpsoftrast.threads[i];
5506                         Thread_LockMutex(thread->drawmutex);
5507                         thread->index = -1;
5508                         Thread_CondSignal(thread->drawcond);
5509                         Thread_UnlockMutex(thread->drawmutex);
5510                         Thread_WaitThread(thread->thread, 0);
5511                         Thread_DestroyCond(thread->waitcond);
5512                         Thread_DestroyCond(thread->drawcond);
5513                         Thread_DestroyMutex(thread->drawmutex);
5514                 }
5515         }
5516         for (i = 0;i < dpsoftrast.texture_end;i++)
5517                 if (dpsoftrast.texture[i].bytes)
5518                         MM_FREE(dpsoftrast.texture[i].bytes);
5519         if (dpsoftrast.texture)
5520                 free(dpsoftrast.texture);
5521         if (dpsoftrast.threads)
5522                 MM_FREE(dpsoftrast.threads);
5523         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5524 }
5525