]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
optimize scanning for 0 alpha in FinishBGRA8
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
20 #ifdef SSE_POSSIBLE
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile int
36                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
39         #elif defined(_MSC_VER)
40                 #define ALIGN(var) __declspec(align(16)) var
41                 #define ATOMIC(var) __declspec(align(32)) var
42                 #define MEMORY_BARRIER (_mm_sfence())
43                 //(MemoryBarrier())
44                 #define ATOMIC_COUNTER volatile LONG
45                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
48         #endif
49 #endif
50
51 #ifndef ALIGN
52 #define ALIGN(var) var
53 #endif
54 #ifndef ATOMIC
55 #define ATOMIC(var) var
56 #endif
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
59 #endif
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
62 #endif
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
65 #endif
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
68 #endif
69 #ifndef ATOMIC_ADD
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
71 #endif
72
73 #ifdef SSE_POSSIBLE
74 #include <emmintrin.h>
75
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
77
78 static void *MM_CALLOC(size_t nmemb, size_t size)
79 {
80         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81         if (ptr != NULL) memset(ptr, 0, nmemb*size);
82         return ptr;
83 }
84
85 #define MM_FREE _mm_free
86 #else
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
89 #define MM_FREE free
90 #endif
91
// Identifiers for the attribute arrays.  DPSOFTRAST_ARRAY_TOTAL is the number
// of identifiers and is used to size tables indexed by this enum (for example
// the attribs member of DPSOFTRAST_State_Triangle).
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION = 0,
        DPSOFTRAST_ARRAY_COLOR = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
107
// One slot of the dpsoftrast.texture table.  A slot whose bytes pointer is
// NULL is treated as unused (DPSOFTRAST_Texture_GetByIndex rejects it and
// DPSOFTRAST_Texture_New reuses it).
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flag and format bits exactly as passed to DPSOFTRAST_Texture_New
        int width; // dimensions of mip level 0 as passed to DPSOFTRAST_Texture_New
        int height;
        int depth;
        int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of entries of mipmap[] that are filled in
        int size; // size of the bytes allocation (sum of all mip levels)
        ATOMIC_COUNTER binds; // while non-zero, DPSOFTRAST_Flush is called before the pixels are modified or freed
        unsigned char *bytes; // pixel storage, 4 bytes per texel; NULL marks a free slot
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: byte offset into bytes, byte size, width, height, depth
}
DPSOFTRAST_Texture;
123
// commands use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header of a command: every struct generated by DEFCOMMAND starts
// with these same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* constants
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the aligned struct type
// DPSOFTRAST_Command_<name>, which consists of the common header followed by
// the given fields.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
144
// size in bytes of the commands buffer below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Byte buffer that holds queued commands.
// NOTE(review): freecommand / usedcommands are presumably the write offset and
// the number of bytes in use - confirm against the command allocator, which
// is outside this part of the file.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
155
// Per-triangle interpolation data read by the DPSOFTRAST_CALCATTRIB macros.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // for each array: [0] is multiplied by span->x, [1] by span->y and
        // [2] is the constant term (see DPSOFTRAST_CALCATTRIB4F)
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
163
// DPSOFTRAST_CALCATTRIB: SSE evaluation of attribute array "arrayindex" of
// "triangle" at the position of "span".  slope receives attribs[..][0] and
// data receives attribs[..][2] + x*attribs[..][0] + y*attribs[..][1]; both
// must be assignable __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// DPSOFTRAST_CALCATTRIB4F: scalar variant, data and slope are float[4].
// Every use of the pointer arguments is parenthesized so that expressions
// such as &span or array+1 can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
182
// One horizontal run of pixels produced from a triangle; the x and y members
// are the values the DPSOFTRAST_CALCATTRIB macros multiply the attribute
// slopes with.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
195
196 #define DPSOFTRAST_DRAW_MAXSPANS 1024
197 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
198 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
199
200 #define DPSOFTRAST_VALIDATE_FB 1
201 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
202 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
203 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
204
// Blend modes; DPSOFTRAST_RecalcBlendFunc picks one of these from the
// blendfunc factor pair.  DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE = 0,
        DPSOFTRAST_BLENDMODE_ALPHA = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
        DPSOFTRAST_BLENDMODE_ADD = 3,
        DPSOFTRAST_BLENDMODE_INVMOD = 4,
        DPSOFTRAST_BLENDMODE_MUL = 5,
        DPSOFTRAST_BLENDMODE_MUL2 = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_INVADD = 9,
        DPSOFTRAST_BLENDMODE_TOTAL = 10
}
DPSOFTRAST_BLENDMODE;
220
// State owned by one rasterizer thread; dpsoftrast.threads is an array of
// dpsoftrast.numthreads of these.  The fb_* members are derived from the
// other members by the DPSOFTRAST_Recalc* functions, which DPSOFTRAST_Validate
// runs for the flags set in validate.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // selects this thread's row bands in DPSOFTRAST_RecalcThread
        
        int cullface;
        int colormask[4];
        int blendfunc[2]; // source and destination factor (GL_* values), see DPSOFTRAST_RecalcBlendFunc
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcViewport
        int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane transformed by DPSOFTRAST_RecalcClipPlane

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height in framebuffer rows, clamped to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode; // one of DPSOFTRAST_BLENDMODE_*

        // band boundaries
        // (two row ranges; both are identical unless dpsoftrast.interlace is set)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
289
// Global rasterizer state; the single instance is the variable dpsoftrast.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        int fb_width; // framebuffer size used to clamp the scissor and to flip y coordinates
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into texture; rebased by DPSOFTRAST_Texture_Grow

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        int texture_max; // number of slots allocated in texture
        int texture_end; // slots at or above this index are not searched or returned
        int texture_firstfree; // index at which DPSOFTRAST_Texture_New starts looking for a free slot
        DPSOFTRAST_Texture *texture; // texture slot table, grown by DPSOFTRAST_Texture_Grow

        int bigendian;

        // error reporting
        // (set to a static message by functions that fail)
        const char *errorstring;

        bool usethreads;
        int interlace; // when non-zero each thread gets two separate row bands (DPSOFTRAST_RecalcThread)
        int numthreads; // number of entries in threads
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);
347
348 DPSOFTRAST_State dpsoftrast;
349
// scale applied by DPSOFTRAST_DEPTH32_FROM_DEPTH32F when converting a float
// depth value to the integer depth buffer format
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (scaled by 255 and rounded) into one integer with
// b in bits 0-7, g in bits 8-15, r in bits 16-23 and a in bits 24-31;
// arguments are parenthesized so that compound expressions are scaled as a whole
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to DPSOFTRAST_DEPTHSCALE * (1 - d), truncated to int
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
354
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
357
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
359 {
360         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362         fb_viewportcenter[3] = 0.5f;
363         fb_viewportcenter[0] = 0.0f;
364         fb_viewportscale[1] = 0.5f * viewport[2];
365         fb_viewportscale[2] = -0.5f * viewport[3];
366         fb_viewportscale[3] = 0.5f;
367         fb_viewportscale[0] = 1.0f;
368 }
369
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
371 {
372         if (dpsoftrast.interlace)
373         {
374                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
378         }
379         else
380         {
381                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
383         }
384 }
385
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
387 {
388         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
393 }
394
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
396 {
397         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398         // and viewport projection values
399         int x1, x2;
400         int y1, y2;
401         x1 = thread->scissor[0];
402         x2 = thread->scissor[0] + thread->scissor[2];
403         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404         y2 = dpsoftrast.fb_height - thread->scissor[1];
405         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
406         if (x1 < 0) x1 = 0;
407         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
408         if (y1 < 0) y1 = 0;
409         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
410         thread->fb_scissor[0] = x1;
411         thread->fb_scissor[1] = y1;
412         thread->fb_scissor[2] = x2 - x1;
413         thread->fb_scissor[3] = y2 - y1;
414
415         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416         DPSOFTRAST_RecalcClipPlane(thread);
417         DPSOFTRAST_RecalcThread(thread);
418 }
419
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
421 {
422         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
423 }
424
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
426 {
427         if (thread->blendsubtract)
428         {
429                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
430                 {
431                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
435                 }
436         }
437         else
438         {       
439                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
440                 {
441                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
446                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
452                 }
453         }
454 }
455
456 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
457
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
459 {
460         mask &= thread->validate;
461         if (!mask)
462                 return;
463         if (mask & DPSOFTRAST_VALIDATE_FB)
464         {
465                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466                 DPSOFTRAST_RecalcFB(thread);
467         }
468         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
469         {
470                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471                 DPSOFTRAST_RecalcDepthFunc(thread);
472         }
473         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
474         {
475                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476                 DPSOFTRAST_RecalcBlendFunc(thread);
477         }
478 }
479
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
481 {
482         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483                 return &dpsoftrast.texture[index];
484         return NULL;
485 }
486
487 static void DPSOFTRAST_Texture_Grow(void)
488 {
489         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490         DPSOFTRAST_State_Thread *thread;
491         int i;
492         int j;
493         DPSOFTRAST_Flush();
494         // expand texture array as needed
495         if (dpsoftrast.texture_max < 1024)
496                 dpsoftrast.texture_max = 1024;
497         else
498                 dpsoftrast.texture_max *= 2;
499         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
500         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501                 if (dpsoftrast.texbound[i])
502                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
503         for (j = 0; j < dpsoftrast.numthreads; j++)
504         {
505                 thread = &dpsoftrast.threads[j];
506                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507                         if (thread->texbound[i])
508                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
509         }
510 }
511
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
513 {
514         int w;
515         int h;
516         int d;
517         int size;
518         int s;
519         int texnum;
520         int mipmaps;
521         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523         DPSOFTRAST_Texture *texture;
524         if (width*height*depth < 1)
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
527                 return 0;
528         }
529         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
532                 return 0;
533         }
534         switch(texformat)
535         {
536         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
539                 break;
540         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
542                 {
543                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
544                         return 0;
545                 }
546                 if (depth != 1)
547                 {
548                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
549                         return 0;
550                 }
551                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
552                 {
553                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
554                         return 0;
555                 }
556                 break;
557         }
558         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
559         {
560                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
561                 return 0;
562         }
563         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
564         {
565                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
566                 return 0;
567         }
568         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569         {
570                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
571                 return 0;
572         }
573         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
576                 return 0;
577         }
578         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
581                 return 0;
582         }
583         // find first empty slot in texture array
584         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585                 if (!dpsoftrast.texture[texnum].bytes)
586                         break;
587         dpsoftrast.texture_firstfree = texnum + 1;
588         if (dpsoftrast.texture_max <= texnum)
589                 DPSOFTRAST_Texture_Grow();
590         if (dpsoftrast.texture_end <= texnum)
591                 dpsoftrast.texture_end = texnum + 1;
592         texture = &dpsoftrast.texture[texnum];
593         memset(texture, 0, sizeof(*texture));
594         texture->flags = flags;
595         texture->width = width;
596         texture->height = height;
597         texture->depth = depth;
598         texture->sides = sides;
599         texture->binds = 0;
600         w = width;
601         h = height;
602         d = depth;
603         size = 0;
604         mipmaps = 0;
605         w = width;
606         h = height;
607         d = depth;
608         for (;;)
609         {
610                 s = w * h * d * sides * 4;
611                 texture->mipmap[mipmaps][0] = size;
612                 texture->mipmap[mipmaps][1] = s;
613                 texture->mipmap[mipmaps][2] = w;
614                 texture->mipmap[mipmaps][3] = h;
615                 texture->mipmap[mipmaps][4] = d;
616                 size += s;
617                 mipmaps++;
618                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
619                         break;
620                 if (w > 1) w >>= 1;
621                 if (h > 1) h >>= 1;
622                 if (d > 1) d >>= 1;
623         }
624         texture->mipmaps = mipmaps;
625         texture->size = size;
626
627         // allocate the pixels now
628         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
629
630         return texnum;
631 }
632 void DPSOFTRAST_Texture_Free(int index)
633 {
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->binds)
637                 DPSOFTRAST_Flush();
638         if (texture->bytes)
639                 MM_FREE(texture->bytes);
640         texture->bytes = NULL;
641         memset(texture, 0, sizeof(*texture));
642         // adjust the free range and used range
643         if (dpsoftrast.texture_firstfree > index)
644                 dpsoftrast.texture_firstfree = index;
645         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646                 dpsoftrast.texture_end--;
647 }
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
649 {
650         int i, x, y, z, w, layer0, layer1, row0, row1;
651         unsigned char *o, *i0, *i1, *i2, *i3;
652         DPSOFTRAST_Texture *texture;
653         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654         if (texture->mipmaps <= 1)
655                 return;
656         for (i = 1;i < texture->mipmaps;i++)
657         {
658                 for (z = 0;z < texture->mipmap[i][4];z++)
659                 {
660                         layer0 = z*2;
661                         layer1 = z*2+1;
662                         if (layer1 >= texture->mipmap[i-1][4])
663                                 layer1 = texture->mipmap[i-1][4]-1;
664                         for (y = 0;y < texture->mipmap[i][3];y++)
665                         {
666                                 row0 = y*2;
667                                 row1 = y*2+1;
668                                 if (row1 >= texture->mipmap[i-1][3])
669                                         row1 = texture->mipmap[i-1][3]-1;
670                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
671                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675                                 w = texture->mipmap[i][2];
676                                 if (layer1 > layer0)
677                                 {
678                                         if (texture->mipmap[i-1][2] > 1)
679                                         {
680                                                 // average 3D texture
681                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
682                                                 {
683                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
687                                                 }
688                                         }
689                                         else
690                                         {
691                                                 // average 3D mipmap with parent width == 1
692                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
693                                                 {
694                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
698                                                 }
699                                         }
700                                 }
701                                 else
702                                 {
703                                         if (texture->mipmap[i-1][2] > 1)
704                                         {
705                                                 // average 2D texture (common case)
706                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
707                                                 {
708                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
712                                                 }
713                                         }
714                                         else
715                                         {
716                                                 // 2D texture with parent width == 1
717                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
718                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
719                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
720                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
721                                         }
722                                 }
723                         }
724                 }
725         }
726 }
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
728 {
729         DPSOFTRAST_Texture *texture;
730         unsigned char *dst;
731         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732         if (texture->binds)
733                 DPSOFTRAST_Flush();
734         if (pixels)
735         {
736                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737                 while (blockheight > 0)
738                 {
739                         memcpy(dst, pixels, blockwidth * 4);
740                         pixels += blockwidth * 4;
741                         dst += texture->mipmap[0][2] * 4;
742                         blockheight--;
743                 }
744         }
745         DPSOFTRAST_Texture_CalculateMipmaps(index);
746 }
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
751         if (texture->binds)
752                 DPSOFTRAST_Flush();
753         if (pixels)
754                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755         DPSOFTRAST_Texture_CalculateMipmaps(index);
756 }
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         return texture->mipmap[mip][2];
762 }
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
764 {
765         DPSOFTRAST_Texture *texture;
766         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767         return texture->mipmap[mip][3];
768 }
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
770 {
771         DPSOFTRAST_Texture *texture;
772         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773         return texture->mipmap[mip][4];
774 }
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
776 {
777         DPSOFTRAST_Texture *texture;
778         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
779         if (texture->binds)
780                 DPSOFTRAST_Flush();
781         return texture->bytes + texture->mipmap[mip][0];
782 }
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
784 {
785         DPSOFTRAST_Texture *texture;
786         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
788         {
789                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
790                 return;
791         }
792         if (texture->binds)
793                 DPSOFTRAST_Flush();
794         texture->filter = filter;
795 }
796
797 static void DPSOFTRAST_Draw_FlushThreads(void);
798
799 static void DPSOFTRAST_Draw_SyncCommands(void)
800 {
801         if(dpsoftrast.usethreads) MEMORY_BARRIER;
802         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
803 }
804
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
806 {
807         DPSOFTRAST_State_Thread *thread;
808         int i;
809         int freecommand = dpsoftrast.commandpool.freecommand;
810         int usedcommands = dpsoftrast.commandpool.usedcommands;
811         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
812                 return;
813         DPSOFTRAST_Draw_SyncCommands();
814         for(;;)
815         {
816                 int waitindex = -1;
817                 int commandoffset;
818                 usedcommands = 0;
819                 for (i = 0; i < dpsoftrast.numthreads; i++)
820                 {
821                         thread = &dpsoftrast.threads[i]; 
822                         commandoffset = freecommand - thread->commandoffset;
823                         if (commandoffset < 0)
824                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
825                         if (commandoffset > usedcommands)
826                         {
827                                 waitindex = i;
828                                 usedcommands = commandoffset;
829                         }
830                 }
831                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
832                         break;
833                 thread = &dpsoftrast.threads[waitindex];
834                 Thread_LockMutex(thread->drawmutex);
835                 if (thread->commandoffset != dpsoftrast.drawcommand)
836                 {
837                         thread->waiting = true;
838                         if (thread->starving) Thread_CondSignal(thread->drawcond);
839                         Thread_CondWait(thread->waitcond, thread->drawmutex);
840                         thread->waiting = false;
841                 }
842                 Thread_UnlockMutex(thread->drawmutex);
843         }
844         dpsoftrast.commandpool.usedcommands = usedcommands;
845 }
846
847 #define DPSOFTRAST_ALIGNCOMMAND(size) \
848         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
849 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
850         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
851
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
853 {
854         DPSOFTRAST_Command *command;
855         int freecommand = dpsoftrast.commandpool.freecommand;
856         int usedcommands = dpsoftrast.commandpool.usedcommands;
857         int extra = sizeof(DPSOFTRAST_Command);
858         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
860         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
861         {
862                 if (dpsoftrast.usethreads)
863                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
864                 else
865                         DPSOFTRAST_Draw_FlushThreads();
866                 freecommand = dpsoftrast.commandpool.freecommand;
867                 usedcommands = dpsoftrast.commandpool.usedcommands;
868         }
869         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
870         {
871                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872                 command->opcode = DPSOFTRAST_OPCODE_Reset;
873                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
874                 freecommand = 0;
875         }
876         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877         command->opcode = opcode;
878         command->commandsize = size;
879         freecommand += size;
880         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
881                 freecommand = 0;
882         dpsoftrast.commandpool.freecommand = freecommand;
883         dpsoftrast.commandpool.usedcommands = usedcommands + size;
884         return command;
885 }
886
887 static void DPSOFTRAST_UndoCommand(int size)
888 {
889         int freecommand = dpsoftrast.commandpool.freecommand;
890         int usedcommands = dpsoftrast.commandpool.usedcommands;
891         freecommand -= size;
892         if (freecommand < 0)
893                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894         usedcommands -= size;
895         dpsoftrast.commandpool.freecommand = freecommand;
896         dpsoftrast.commandpool.usedcommands = usedcommands;
897 }
898                 
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
901 {
902         thread->viewport[0] = command->x;
903         thread->viewport[1] = command->y;
904         thread->viewport[2] = command->width;
905         thread->viewport[3] = command->height;
906         thread->validate |= DPSOFTRAST_VALIDATE_FB;
907 }
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
909 {
910         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
911         command->x = x;
912         command->y = y;
913         command->width = width;
914         command->height = height;
915
916         dpsoftrast.viewport[0] = x;
917         dpsoftrast.viewport[1] = y;
918         dpsoftrast.viewport[2] = width;
919         dpsoftrast.viewport[3] = height;
920         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
921 }
922
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
925 {
926         int i, x1, y1, x2, y2, w, h, x, y;
927         int miny1, maxy1, miny2, maxy2;
928         int bandy;
929         unsigned int *p;
930         unsigned int c;
931         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932         miny1 = thread->miny1;
933         maxy1 = thread->maxy1;
934         miny2 = thread->miny2;
935         maxy2 = thread->maxy2;
936         x1 = thread->fb_scissor[0];
937         y1 = thread->fb_scissor[1];
938         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
940         if (y1 < miny1) y1 = miny1;
941         if (y2 > maxy2) y2 = maxy2;
942         w = x2 - x1;
943         h = y2 - y1;
944         if (w < 1 || h < 1)
945                 return;
946         // FIXME: honor fb_colormask?
947         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948         for (i = 0;i < 4;i++)
949         {
950                 if (!dpsoftrast.fb_colorpixels[i])
951                         continue;
952                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
953                 for (;y < bandy;y++)
954                 {
955                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956                         for (x = x1;x < x2;x++)
957                                 p[x] = c;
958                 }
959         }
960 }
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
962 {
963         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
964         command->r = r;
965         command->g = g;
966         command->b = b;
967         command->a = a;
968 }
969
970 DEFCOMMAND(3, ClearDepth, float depth;)
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
972 {
973         int x1, y1, x2, y2, w, h, x, y;
974         int miny1, maxy1, miny2, maxy2;
975         int bandy;
976         unsigned int *p;
977         unsigned int c;
978         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979         miny1 = thread->miny1;
980         maxy1 = thread->maxy1;
981         miny2 = thread->miny2;
982         maxy2 = thread->maxy2;
983         x1 = thread->fb_scissor[0];
984         y1 = thread->fb_scissor[1];
985         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
987         if (y1 < miny1) y1 = miny1;
988         if (y2 > maxy2) y2 = maxy2;
989         w = x2 - x1;
990         h = y2 - y1;
991         if (w < 1 || h < 1)
992                 return;
993         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
994         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
995         for (;y < bandy;y++)
996         {
997                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998                 for (x = x1;x < x2;x++)
999                         p[x] = c;
1000         }
1001 }
1002 void DPSOFTRAST_ClearDepth(float d)
1003 {
1004         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1005         command->depth = d;
1006 }
1007
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1010 {
1011         thread->colormask[0] = command->r != 0;
1012         thread->colormask[1] = command->g != 0;
1013         thread->colormask[2] = command->b != 0;
1014         thread->colormask[3] = command->a != 0;
1015         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1016 }
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1018 {
1019         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1020         command->r = r;
1021         command->g = g;
1022         command->b = b;
1023         command->a = a;
1024 }
1025
1026 DEFCOMMAND(5, DepthTest, int enable;)
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1028 {
1029         thread->depthtest = command->enable;
1030         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1031 }
1032 void DPSOFTRAST_DepthTest(int enable)
1033 {
1034         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035         command->enable = enable;
1036 }
1037
1038 DEFCOMMAND(6, ScissorTest, int enable;)
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1040 {
1041         thread->scissortest = command->enable;
1042         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1043 }
1044 void DPSOFTRAST_ScissorTest(int enable)
1045 {
1046         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047         command->enable = enable;
1048 }
1049
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1052 {
1053         thread->scissor[0] = command->x;
1054         thread->scissor[1] = command->y;
1055         thread->scissor[2] = command->width;
1056         thread->scissor[3] = command->height;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1060 {
1061         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1062         command->x = x;
1063         command->y = y;
1064         command->width = width;
1065         command->height = height;
1066 }
1067
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1070 {
1071         thread->blendfunc[0] = command->sfactor;
1072         thread->blendfunc[1] = command->dfactor;
1073         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1074 }
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1076 {
1077         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078         command->sfactor = sfactor;
1079         command->dfactor = dfactor;
1080 }
1081
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1084 {
1085         thread->blendsubtract = command->enable;
1086         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1087 }
1088 void DPSOFTRAST_BlendSubtract(int enable)
1089 {
1090         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091         command->enable = enable;
1092 }
1093
1094 DEFCOMMAND(10, DepthMask, int enable;)
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1096 {
1097         thread->depthmask = command->enable;
1098 }
1099 void DPSOFTRAST_DepthMask(int enable)
1100 {
1101         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102         command->enable = enable;
1103 }
1104
1105 DEFCOMMAND(11, DepthFunc, int func;)
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1107 {
1108         thread->depthfunc = command->func;
1109 }
1110 void DPSOFTRAST_DepthFunc(int func)
1111 {
1112         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113         command->func = func;
1114 }
1115
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1118 {
1119         thread->depthrange[0] = command->nearval;
1120         thread->depthrange[1] = command->farval;
1121 }
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1123 {
1124         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125         command->nearval = nearval;
1126         command->farval = farval;
1127 }
1128
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1131 {
1132         thread->polygonoffset[0] = command->alongnormal;
1133         thread->polygonoffset[1] = command->intoview;
1134 }
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1136 {
1137         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138         command->alongnormal = alongnormal;
1139         command->intoview = intoview;
1140 }
1141
1142 DEFCOMMAND(14, CullFace, int mode;)
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1144 {
1145         thread->cullface = command->mode;
1146 }
1147 void DPSOFTRAST_CullFace(int mode)
1148 {
1149         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150         command->mode = mode;
1151 }
1152
1153 DEFCOMMAND(15, AlphaTest, int enable;)
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1155 {
1156         thread->alphatest = command->enable;
1157 }
1158 void DPSOFTRAST_AlphaTest(int enable)
1159 {
1160         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161         command->enable = enable;
1162 }
1163
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1166 {
1167         thread->alphafunc = command->func;
1168         thread->alphavalue = command->ref;
1169 }
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1171 {
1172         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173         command->func = func;
1174         command->ref = ref;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - 1 - sy1;
1282         for (y = 0;y < th;y++)
1283                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1284         if (texture->mipmaps > 1)
1285                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1286 }
1287
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1290 {
1291         if (thread->texbound[command->unitnum])
1292                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293         thread->texbound[command->unitnum] = command->texture;
1294 }
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1296 {
1297         DPSOFTRAST_Command_SetTexture *command;
1298         DPSOFTRAST_Texture *texture;
1299         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1300         {
1301                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1302                 return;
1303         }
1304         texture = DPSOFTRAST_Texture_GetByIndex(index);
1305         if (index && !texture)
1306         {
1307                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1308                 return;
1309         }
1310
1311         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312         command->unitnum = unitnum;
1313         command->texture = texture;
1314
1315         dpsoftrast.texbound[unitnum] = texture;
1316         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1317 }
1318
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1320 {
1321         dpsoftrast.pointer_vertex3f = vertex3f;
1322         dpsoftrast.stride_vertex = stride;
1323 }
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1325 {
1326         dpsoftrast.pointer_color4f = color4f;
1327         dpsoftrast.pointer_color4ub = NULL;
1328         dpsoftrast.stride_color = stride;
1329 }
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1331 {
1332         dpsoftrast.pointer_color4f = NULL;
1333         dpsoftrast.pointer_color4ub = color4ub;
1334         dpsoftrast.stride_color = stride;
1335 }
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1337 {
1338         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340         dpsoftrast.stride_texcoord[unitnum] = stride;
1341 }
1342
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1345 {
1346         thread->shader_mode = command->mode;
1347         thread->shader_permutation = command->permutation;
1348         thread->shader_exactspecularmath = command->exactspecularmath;
1349 }
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1351 {
1352         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353         command->mode = mode;
1354         command->permutation = permutation;
1355         command->exactspecularmath = exactspecularmath;
1356
1357         dpsoftrast.shader_mode = mode;
1358         dpsoftrast.shader_permutation = permutation;
1359         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1360 }
1361
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1364 {
1365         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1366 }
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1368 {
1369         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370         command->index = index;
1371         command->val[0] = v0;
1372         command->val[1] = v1;
1373         command->val[2] = v2;
1374         command->val[3] = v3;
1375
1376         dpsoftrast.uniform4f[index*4+0] = v0;
1377         dpsoftrast.uniform4f[index*4+1] = v1;
1378         dpsoftrast.uniform4f[index*4+2] = v2;
1379         dpsoftrast.uniform4f[index*4+3] = v3;
1380 }
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1382 {
1383         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384         command->index = index;
1385         memcpy(command->val, v, sizeof(command->val));
1386
1387         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1388 }
1389
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1392 {
1393         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1394 }
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1396 {
1397 #ifdef SSE_POSSIBLE
1398         int i, index;
1399         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1400         {
1401                 __m128 m0, m1, m2, m3;
1402                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403                 command->index = (DPSOFTRAST_UNIFORM)index;
1404                 if (((size_t)v)&(ALIGN_SIZE-1))
1405                 {
1406                         m0 = _mm_loadu_ps(v);
1407                         m1 = _mm_loadu_ps(v+4);
1408                         m2 = _mm_loadu_ps(v+8);
1409                         m3 = _mm_loadu_ps(v+12);
1410                 }
1411                 else
1412                 {
1413                         m0 = _mm_load_ps(v);
1414                         m1 = _mm_load_ps(v+4);
1415                         m2 = _mm_load_ps(v+8);
1416                         m3 = _mm_load_ps(v+12);
1417                 }
1418                 if (transpose)
1419                 {
1420                         __m128 t0, t1, t2, t3;
1421                         t0 = _mm_unpacklo_ps(m0, m1);
1422                         t1 = _mm_unpacklo_ps(m2, m3);
1423                         t2 = _mm_unpackhi_ps(m0, m1);
1424                         t3 = _mm_unpackhi_ps(m2, m3);
1425                         m0 = _mm_movelh_ps(t0, t1);
1426                         m1 = _mm_movehl_ps(t1, t0);
1427                         m2 = _mm_movelh_ps(t2, t3);
1428                         m3 = _mm_movehl_ps(t3, t2);                     
1429                 }
1430                 _mm_store_ps(command->val, m0);
1431                 _mm_store_ps(command->val+4, m1);
1432                 _mm_store_ps(command->val+8, m2);
1433                 _mm_store_ps(command->val+12, m3);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1438         }
1439 #endif
1440 }
1441
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1444 {
1445         thread->uniform1i[command->index] = command->val;
1446 }
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1448 {
1449         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450         command->index = index;
1451         command->val = i0;
1452
1453         dpsoftrast.uniform1i[command->index] = i0;
1454 }
1455
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1458 {
1459         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1460         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1461 }
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1463 {
1464         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465         command->clipplane[0] = x;
1466         command->clipplane[1] = y;
1467         command->clipplane[2] = z;
1468         command->clipplane[3] = w;
1469 }
1470
1471 #ifdef SSE_POSSIBLE
1472 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1473 {
1474         float *end = dst + size*4;
1475         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1476         {
1477                 while (dst < end)
1478                 {
1479                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1480                         dst += 4;
1481                         src += stride;
1482                 }
1483         }
1484         else
1485         {
1486                 while (dst < end)
1487                 {
1488                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1489                         dst += 4;
1490                         src += stride;
1491                 }
1492         }
1493 }
1494
1495 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1496 {
1497         float *end = dst + size*4;
1498         if (stride == sizeof(float[3]))
1499         {
1500                 float *end4 = dst + (size&~3)*4;        
1501                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1502                 {
1503                         while (dst < end4)
1504                         {
1505                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1506                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1507                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1510                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1511                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1512                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1513                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1514                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1517                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518                                 dst += 16;
1519                                 src += 4*sizeof(float[3]);
1520                         }
1521                 }
1522                 else
1523                 {
1524                         while (dst < end4)
1525                         {
1526                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1527                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1528                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1531                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1532                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1534                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1535                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1538                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539                                 dst += 16;
1540                                 src += 4*sizeof(float[3]);
1541                         }
1542                 }
1543         }
1544         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1545         {
1546                 while (dst < end)
1547                 {
1548                         __m128 v = _mm_loadu_ps((const float *)src);
1549                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552                         _mm_store_ps(dst, v);
1553                         dst += 4;
1554                         src += stride;
1555                 }
1556         }
1557         else
1558         {
1559                 while (dst < end)
1560                 {
1561                         __m128 v = _mm_load_ps((const float *)src);
1562                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565                         _mm_store_ps(dst, v);
1566                         dst += 4;
1567                         src += stride;
1568                 }
1569         }
1570 }
1571
1572 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1573 {
1574         float *end = dst + size*4;
1575         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1576         if (stride == sizeof(float[2]))
1577         {
1578                 float *end2 = dst + (size&~1)*4;
1579                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1580                 {
1581                         while (dst < end2)
1582                         {
1583                                 __m128 v = _mm_loadu_ps((const float *)src);
1584                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1585                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1586                                 dst += 8;
1587                                 src += 2*sizeof(float[2]);
1588                         }
1589                 }
1590                 else
1591                 {
1592                         while (dst < end2)
1593                         {
1594                                 __m128 v = _mm_load_ps((const float *)src);
1595                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1596                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1597                                 dst += 8;
1598                                 src += 2*sizeof(float[2]);
1599                         }
1600                 }
1601         }
1602         while (dst < end)
1603         {
1604                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1605                 dst += 4;
1606                 src += stride;
1607         }
1608 }
1609
1610 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1611 {
1612         float *end = dst + size*4;
1613         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1614         if (stride == sizeof(unsigned char[4]))
1615         {
1616                 float *end4 = dst + (size&~3)*4;
1617                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1618                 {
1619                         while (dst < end4)
1620                         {
1621                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1626                                 dst += 16;
1627                                 src += 4*sizeof(unsigned char[4]);
1628                         }
1629                 }
1630                 else
1631                 {
1632                         while (dst < end4)
1633                         {
1634                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1639                                 dst += 16;
1640                                 src += 4*sizeof(unsigned char[4]);
1641                         }
1642                 }
1643         }
1644         while (dst < end)
1645         {
1646                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1647                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1648                 dst += 4;
1649                 src += stride;
1650         }
1651 }
1652
// writes 'size' copies of the 4-float value at src into the packed float[4]
// array dst
// dst is written with aligned stores, so it must be 16-byte aligned; src may
// have any alignment
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1663 #endif
1664
// multiplies 'numitems' 4-float vectors by a 4x4 matrix given as 16 floats:
// out = in.x*m[0..3] + in.y*m[4..7] + in.z*m[8..11] + in.w*m[12..15]
// out4f is written with aligned stores, so it must be 16-byte aligned; in4f
// and inmatrix16f may have any alignment
// each input vector is loaded before its output is stored
// does nothing when SSE_POSSIBLE is not defined
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix: plain copy, or nothing at all when in place
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1712
// copies 'numitems' 4-float vectors from in4f to out4f (plain memcpy, so the
// two ranges must not overlap)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1717
1718 #ifdef SSE_POSSIBLE
// perspective-divides a vertex and applies the viewport mapping
// in:  __m128 (x, y, z, w); evaluated once, so 'out' may be the same variable
// out: (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
//      where c = viewportcenter and s = viewportscale, i.e. those two vectors
//      are consumed in (w-term, x, y, z) lane order
// how: the vertex is rotated to (w, x, y, z), lane 0 is replaced by 1.0f, the
//      whole vector is scaled, divided by the broadcast w and offset, then
//      rotated back
// the expansion is a braced block that declares locals named p and w, so
// arguments must not be expressions that refer to caller variables with
// those names
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1726
// NOTE(review): despite the name, this body is identical to
// DPSOFTRAST_PROJECTVERTEX - it projects all lanes, not only y
// in:  __m128 (x, y, z, w); evaluated once, so 'out' may be the same variable
// out: (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
//      where c = viewportcenter and s = viewportscale
// the expansion is a braced block that declares locals named p and w
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1734
// transforms one vertex by a matrix held in four __m128 values:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// 'in' is evaluated once into a local, so 'out' may be the same variable
// the expansion is a braced block that declares a local named p, so the
// arguments must not refer to a caller variable with that name
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
						  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
								_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
											_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1743
// computes a vertical screen range for the axis-aligned box [minposf, maxposf]
// after transformation by inmatrix16f
// starty, endy: receive the resulting range (see the end of the function)
// minposf, maxposf: box corners as 4 floats each, read with aligned loads and
//                   therefore required to be 16-byte aligned
// inmatrix16f: 16 floats, any alignment
// returns a bitmask with one bit per box corner (bit k = corner k); a bit is
// still set when that corner failed the z + w >= 0 test
// corner numbering: bit 0 of k selects max instead of min for x, bit 1 for y,
// bit 2 for z, so k^1, k^2 and k^4 are the corners sharing an edge with k
// defines the helper macros BBFRONT and BBCLIP, which are not #undef'd
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
{
	int clipmask = 0xFF;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	// minproj/maxproj start inverted (2 / -2) so that any sample replaces them
	__m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	__m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
	__m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
	// swap lanes 0 and 1 of each matrix vector, so transformed corners come
	// out as (y, x, z, w) and the scalar (_ss) operations below work on y
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// BBFRONT(k, pos): transforms corner k into bb[k] and stores z + w in lane
	// 0 of clipdist[k]; if that is >= 0 the corner's clipmask bit is cleared
	// and its y/w is merged into minproj/maxproj
	#define BBFRONT(k, pos) \
	{ \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			__m128 proj; \
			clipmask &= ~(1<<k); \
			proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
			minproj = _mm_min_ss(minproj, proj); \
			maxproj = _mm_max_ss(maxproj, proj); \
		} \
	}
	// the eight corners, built by mixing lanes of minpos and maxpos
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// BBCLIP(k): for a corner k whose clipmask bit is still set, looks at each
	// of its three edge neighbours; when the neighbour's bit is clear, the
	// point on that edge where the interpolated clip distance is zero
	// (frac = d[k] / (d[k] - d[neighbour])) is computed and its y/w is merged
	// into minproj/maxproj
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			if (!(clipmask&(1<<(k^1)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
			if (!(clipmask&(1<<(k^2)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
			if (!(clipmask&(1<<(k^4)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
		} \
	}
	BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
	// bring element 2 of the viewport vectors into lane 0; that is the element
	// DPSOFTRAST_PROJECTVERTEX applies to y
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// clamp the projected range to [-2, 2] before mapping it to the viewport
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// NOTE(review): starty is taken from maxproj and endy from minproj, which
	// presumably relies on the y viewport scale being negative - confirm
	// against where fb_viewportscale is set
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1817         
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1819 {
1820         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821         float *end = out4f + numitems*4;
1822         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823         __m128 minpos, maxpos;
1824         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1825         {
1826                 minpos = maxpos = _mm_loadu_ps(in4f);
1827                 while (out4f < end)
1828                 {
1829                         __m128 v = _mm_loadu_ps(in4f);
1830                         minpos = _mm_min_ps(minpos, v);
1831                         maxpos = _mm_max_ps(maxpos, v);
1832                         _mm_store_ps(out4f, v);
1833                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834                         _mm_store_ps(screen4f, v);
1835                         in4f += 4;
1836                         out4f += 4;
1837                         screen4f += 4;
1838                 }
1839         }
1840         else
1841         {
1842                 minpos = maxpos = _mm_load_ps(in4f);
1843                 while (out4f < end)
1844                 {
1845                         __m128 v = _mm_load_ps(in4f);
1846                         minpos = _mm_min_ps(minpos, v);
1847                         maxpos = _mm_max_ps(maxpos, v);
1848                         _mm_store_ps(out4f, v);
1849                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850                         _mm_store_ps(screen4f, v);
1851                         in4f += 4;
1852                         out4f += 4;
1853                         screen4f += 4;
1854                 }
1855         }
1856         if (starty && endy) 
1857         {
1858                 ALIGN(float minposf[4]);
1859                 ALIGN(float maxposf[4]);
1860                 _mm_store_ps(minposf, minpos);
1861                 _mm_store_ps(maxposf, maxpos);
1862                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1863         }
1864         return 0;
1865 }
1866
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1868 {
1869         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1871         float *end;
1872         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874         end = out4f + numitems*4;
1875         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877         m0 = _mm_loadu_ps(inmatrix16f);
1878         m1 = _mm_loadu_ps(inmatrix16f + 4);
1879         m2 = _mm_loadu_ps(inmatrix16f + 8);
1880         m3 = _mm_loadu_ps(inmatrix16f + 12);
1881         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1882         {
1883                 minpos = maxpos = _mm_loadu_ps(in4f);
1884                 while (out4f < end)
1885                 {
1886                         __m128 v = _mm_loadu_ps(in4f);
1887                         minpos = _mm_min_ps(minpos, v);
1888                         maxpos = _mm_max_ps(maxpos, v);
1889                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890                         _mm_store_ps(out4f, v);
1891                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892                         _mm_store_ps(screen4f, v);
1893                         in4f += 4;
1894                         out4f += 4;
1895                         screen4f += 4;
1896                 }
1897         }
1898         else
1899         {
1900                 minpos = maxpos = _mm_load_ps(in4f);
1901                 while (out4f < end)
1902                 {
1903                         __m128 v = _mm_load_ps(in4f);
1904                         minpos = _mm_min_ps(minpos, v);
1905                         maxpos = _mm_max_ps(maxpos, v);
1906                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907                         _mm_store_ps(out4f, v);
1908                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909                         _mm_store_ps(screen4f, v);
1910                         in4f += 4;
1911                         out4f += 4;
1912                         screen4f += 4;
1913                 }
1914         }
1915         if (starty && endy) 
1916         {
1917                 ALIGN(float minposf[4]);
1918                 ALIGN(float maxposf[4]);
1919                 _mm_store_ps(minposf, minpos);
1920                 _mm_store_ps(maxposf, maxpos);
1921                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1922         }
1923         return 0;
1924 }
1925 #endif
1926
// converts the current vertex range (dpsoftrast.firstvertex, numvertices) of
// input array 'inarray' into float[4] form in dpsoftrast.post_array4f[outarray]
// and returns that buffer
// - position: 3 floats per vertex
// - color: 4 floats, else 4 bytes, else the constant dpsoftrast.color
// - anything else is treated as texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//   with 2, 3 or 4 float components; if the unit has no pointer or another
//   component count the buffer is returned without being written
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float * const dest = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		const int stride = dpsoftrast.stride_vertex;
		const unsigned char *base = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		const int stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
			DPSOFTRAST_Load4fTo4f(dest, (const unsigned char *)dpsoftrast.pointer_color4f + first * stride, count, stride);
		else if (dpsoftrast.pointer_color4ub)
			DPSOFTRAST_Load4bTo4f(dest, (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride, count, stride);
		else
			DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		const int stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const unsigned char *base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(dest, base, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
				break;
			}
		}
	}
	return dest;
#else
	return NULL;
#endif
}
1985
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1987 {
1988         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1990         return data;
1991 }
1992
1993 #if 0
// projects an array without transforming it: fills dpsoftrast.screencoord4f,
// drawstarty/drawendy and drawclipped, and returns the array
// inarray >= 0: the array is first loaded into post_array4f[outarray]
// inarray < 0:  whatever post_array4f[outarray] already holds is used
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
2004 #endif
2005
// transforms an array in place by inmatrix16f and projects it: fills
// dpsoftrast.screencoord4f, drawstarty/drawendy and drawclipped, and returns
// the transformed array
// inarray >= 0: the array is first loaded into post_array4f[outarray]
// inarray < 0:  whatever post_array4f[outarray] already holds is used
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2016
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2018 {
2019         int x;
2020         int startx = span->startx;
2021         int endx = span->endx;
2022         float wslope = triangle->w[0];
2023         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2024         float endz = 1.0f / (w + wslope * startx);
2025         if (triangle->w[0] == 0)
2026         {
2027                 // LordHavoc: fast flat polygons (HUD/menu)
2028                 for (x = startx;x < endx;x++)
2029                         zf[x] = endz;
2030                 return;
2031         }
2032         for (x = startx;x < endx;)
2033         {
2034                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2035                 float z = endz, dz;
2036                 if (nextsub >= endx) nextsub = endsub = endx-1;
2037                 endz = 1.0f / (w + wslope * nextsub);
2038                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039                 for (; x <= endsub; x++, z += dz)
2040                         zf[x] = z;
2041         }
2042 }
2043
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2045 {
2046 #ifdef SSE_POSSIBLE
2047         int x;
2048         int startx = span->startx;
2049         int endx = span->endx;
2050         int maskx;
2051         int subx;
2052         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2053         unsigned char * RESTRICT pixelmask = span->pixelmask;
2054         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2055         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2056         if (!pixel)
2057                 return;
2058         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2059         pixeli += span->y * dpsoftrast.fb_width + span->x;
2060         // handle alphatest now (this affects depth writes too)
2061         if (thread->alphatest)
2062                 for (x = startx;x < endx;x++)
2063                         if (in4ub[x*4+3] < 128)
2064                                 pixelmask[x] = false;
2065         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2066         // helps sprites, text and hud artwork
2067         switch(thread->fb_blendmode)
2068         {
2069         case DPSOFTRAST_BLENDMODE_ALPHA:
2070         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2071         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2072                 maskx = startx;
2073                 for (x = startx;x < endx;x++)
2074                 {
2075                         if (in4ub[x*4+3] >= 1)
2076                         {
2077                                 startx = x;
2078                                 for (;;)
2079                                 {
2080                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2081                                         maskx = x;
2082                                         if (x >= endx) break;
2083                                         ++x;
2084                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2085                                         if (x >= endx) break;
2086                                 }
2087                                 break;
2088                         }
2089                 }
2090                 endx = maskx;
2091                 break;
2092         case DPSOFTRAST_BLENDMODE_OPAQUE:
2093         case DPSOFTRAST_BLENDMODE_ADD:
2094         case DPSOFTRAST_BLENDMODE_INVMOD:
2095         case DPSOFTRAST_BLENDMODE_MUL:
2096         case DPSOFTRAST_BLENDMODE_MUL2:
2097         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2098         case DPSOFTRAST_BLENDMODE_INVADD:
2099                 break;
2100         }
2101         // put some special values at the end of the mask to ensure the loops end
2102         pixelmask[endx] = 1;
2103         pixelmask[endx+1] = 0;
2104         // LordHavoc: use a double loop to identify subspans, this helps the
2105         // optimized copy/blend loops to perform at their best, most triangles
2106         // have only one run of pixels, and do the search using wide reads...
2107         x = startx;
2108         while (x < endx)
2109         {
2110                 // if this pixel is masked off, it's probably not alone...
2111                 if (!pixelmask[x])
2112                 {
2113                         x++;
2114 #if 1
2115                         if (x + 8 < endx)
2116                         {
2117                                 // the 4-item search must be aligned or else it stalls badly
2118                                 if ((x & 3) && !pixelmask[x]) 
2119                                 {
2120                                         if(pixelmask[x]) goto endmasked;
2121                                         x++;
2122                                         if (x & 3)
2123                                         {
2124                                                 if(pixelmask[x]) goto endmasked;
2125                                                 x++;
2126                                                 if (x & 3)
2127                                                 {
2128                                                         if(pixelmask[x]) goto endmasked;
2129                                                         x++;
2130                                                 }
2131                                         }
2132                                 }
2133                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2134                                         x += 4;
2135                         }
2136 #endif
2137                         for (;!pixelmask[x];x++)
2138                                 ;
2139                         // rather than continue the loop, just check the end variable
2140                         if (x >= endx)
2141                                 break;
2142                 }
2143         endmasked:
2144                 // find length of subspan
2145                 subx = x + 1;
2146 #if 1
2147                 if (subx + 8 < endx)
2148                 {
2149                         if (subx & 3)
2150                         {
2151                                 if(!pixelmask[subx]) goto endunmasked;
2152                                 subx++;
2153                                 if (subx & 3)
2154                                 {
2155                                         if(!pixelmask[subx]) goto endunmasked;
2156                                         subx++;
2157                                         if (subx & 3)
2158                                         {
2159                                                 if(!pixelmask[subx]) goto endunmasked;
2160                                                 subx++;
2161                                         }
2162                                 }
2163                         }
2164                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2165                                 subx += 4;
2166                 }
2167 #endif
2168                 for (;pixelmask[subx];subx++)
2169                         ;
2170                 // the checks can overshoot, so make sure to clip it...
2171                 if (subx > endx)
2172                         subx = endx;
2173         endunmasked:
2174                 // now that we know the subspan length...  process!
2175                 switch(thread->fb_blendmode)
2176                 {
2177                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2178 #if 0
2179                         if (subx - x >= 16)
2180                         {
2181                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2182                                 x = subx;
2183                         }
2184                         else
2185 #elif 1
2186                         while (x + 16 <= subx)
2187                         {
2188                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2189                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2190                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2191                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2192                                 x += 16;
2193                         }
2194 #endif
2195                         {
2196                                 while (x + 4 <= subx)
2197                                 {
2198                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2199                                         x += 4;
2200                                 }
2201                                 if (x + 2 <= subx)
2202                                 {
2203                                         pixeli[x] = ini[x];
2204                                         pixeli[x+1] = ini[x+1];
2205                                         x += 2;
2206                                 }
2207                                 if (x < subx)
2208                                 {
2209                                         pixeli[x] = ini[x];
2210                                         x++;
2211                                 }
2212                         }
2213                         break;
2214                 case DPSOFTRAST_BLENDMODE_ALPHA:
2215                 #define FINISHBLEND(blend2, blend1) \
2216                         for (;x + 1 < subx;x += 2) \
2217                         { \
2218                                 __m128i src, dst; \
2219                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2220                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2221                                 blend2; \
2222                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2223                         } \
2224                         if (x < subx) \
2225                         { \
2226                                 __m128i src, dst; \
2227                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2228                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2229                                 blend1; \
2230                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2231                                 x++; \
2232                         }
2233                         FINISHBLEND({
2234                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2236                         }, {
2237                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2239                         });
2240                         break;
2241                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2242                         FINISHBLEND({
2243                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2244                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2245                         }, {
2246                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2247                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2248                         });
2249                         break;
2250                 case DPSOFTRAST_BLENDMODE_ADD:
2251                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2252                         break;
2253                 case DPSOFTRAST_BLENDMODE_INVMOD:
2254                         FINISHBLEND({
2255                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2256                         }, {
2257                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2258                         });
2259                         break;
2260                 case DPSOFTRAST_BLENDMODE_MUL:
2261                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2262                         break;
2263                 case DPSOFTRAST_BLENDMODE_MUL2:
2264                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2265                         break;
2266                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2267                         FINISHBLEND({
2268                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2270                         }, {
2271                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2273                         });
2274                         break;
2275                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2276                         FINISHBLEND({
2277                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2278                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2279                         }, {
2280                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2281                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2282                         });
2283                         break;
2284                 case DPSOFTRAST_BLENDMODE_INVADD:
2285                         FINISHBLEND({
2286                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2287                         }, {
2288                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2289                         });
2290                         break;
2291                 }
2292         }
2293 #endif
2294 }
2295
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2297 {
2298         int x;
2299         int startx = span->startx;
2300         int endx = span->endx;
2301         int flags;
2302         float c[4];
2303         float data[4];
2304         float slope[4];
2305         float tc[2], endtc[2];
2306         float tcscale[2];
2307         unsigned int tci[2];
2308         unsigned int tci1[2];
2309         unsigned int tcimin[2];
2310         unsigned int tcimax[2];
2311         int tciwrapmask[2];
2312         int tciwidth;
2313         int filter;
2314         int mip;
2315         const unsigned char * RESTRICT pixelbase;
2316         const unsigned char * RESTRICT pixel[4];
2317         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318         // if no texture is bound, just fill it with white
2319         if (!texture)
2320         {
2321                 for (x = startx;x < endx;x++)
2322                 {
2323                         out4f[x*4+0] = 1.0f;
2324                         out4f[x*4+1] = 1.0f;
2325                         out4f[x*4+2] = 1.0f;
2326                         out4f[x*4+3] = 1.0f;
2327                 }
2328                 return;
2329         }
2330         mip = triangle->mip[texunitindex];
2331         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332         // if this mipmap of the texture is 1 pixel, just fill it with that color
2333         if (texture->mipmap[mip][1] == 4)
2334         {
2335                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339                 for (x = startx;x < endx;x++)
2340                 {
2341                         out4f[x*4+0] = c[0];
2342                         out4f[x*4+1] = c[1];
2343                         out4f[x*4+2] = c[2];
2344                         out4f[x*4+3] = c[3];
2345                 }
2346                 return;
2347         }
2348         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350         flags = texture->flags;
2351         tcscale[0] = texture->mipmap[mip][2];
2352         tcscale[1] = texture->mipmap[mip][3];
2353         tciwidth = texture->mipmap[mip][2];
2354         tcimin[0] = 0;
2355         tcimin[1] = 0;
2356         tcimax[0] = texture->mipmap[mip][2]-1;
2357         tcimax[1] = texture->mipmap[mip][3]-1;
2358         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2360         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2361         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2362         if (filter)
2363         {
2364                 endtc[0] -= 0.5f;
2365                 endtc[1] -= 0.5f;
2366         }
2367         for (x = startx;x < endx;)
2368         {
2369                 unsigned int subtc[2];
2370                 unsigned int substep[2];
2371                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2372                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2373                 if (nextsub >= endx)
2374                 {
2375                         nextsub = endsub = endx-1;      
2376                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2377                 }
2378                 tc[0] = endtc[0];
2379                 tc[1] = endtc[1];
2380                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2381                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2382                 if (filter)
2383                 {
2384                         endtc[0] -= 0.5f;
2385                         endtc[1] -= 0.5f;
2386                 }
2387                 substep[0] = (endtc[0] - tc[0]) * subscale;
2388                 substep[1] = (endtc[1] - tc[1]) * subscale;
2389                 subtc[0] = tc[0] * (1<<12);
2390                 subtc[1] = tc[1] * (1<<12);
2391                 if (filter)
2392                 {
2393                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2394                         {
2395                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2396                                 {
2397                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2398                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2399                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2400                                         tci[0] = subtc[0]>>12;
2401                                         tci[1] = subtc[1]>>12;
2402                                         tci1[0] = tci[0] + 1;
2403                                         tci1[1] = tci[1] + 1;
2404                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2405                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2406                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2407                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2408                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2409                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2410                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2411                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2412                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2413                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2414                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2415                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2416                                         out4f[x*4+0] = c[0];
2417                                         out4f[x*4+1] = c[1];
2418                                         out4f[x*4+2] = c[2];
2419                                         out4f[x*4+3] = c[3];
2420                                 }
2421                         }
2422                         else
2423                         {
2424                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2425                                 {
2426                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2427                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2428                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2429                                         tci[0] = subtc[0]>>12;
2430                                         tci[1] = subtc[1]>>12;
2431                                         tci1[0] = tci[0] + 1;
2432                                         tci1[1] = tci[1] + 1;
2433                                         tci[0] &= tciwrapmask[0];
2434                                         tci[1] &= tciwrapmask[1];
2435                                         tci1[0] &= tciwrapmask[0];
2436                                         tci1[1] &= tciwrapmask[1];
2437                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2438                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2439                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2440                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2441                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2442                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2443                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2444                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2445                                         out4f[x*4+0] = c[0];
2446                                         out4f[x*4+1] = c[1];
2447                                         out4f[x*4+2] = c[2];
2448                                         out4f[x*4+3] = c[3];
2449                                 }
2450                         }
2451                 }
2452                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2453                 {
2454                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2455                         {
2456                                 tci[0] = subtc[0]>>12;
2457                                 tci[1] = subtc[1]>>12;
2458                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2459                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2460                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2461                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2462                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2463                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2464                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2465                                 out4f[x*4+0] = c[0];
2466                                 out4f[x*4+1] = c[1];
2467                                 out4f[x*4+2] = c[2];
2468                                 out4f[x*4+3] = c[3];
2469                         }
2470                 }
2471                 else
2472                 {
2473                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2474                         {
2475                                 tci[0] = subtc[0]>>12;
2476                                 tci[1] = subtc[1]>>12;
2477                                 tci[0] &= tciwrapmask[0];
2478                                 tci[1] &= tciwrapmask[1];
2479                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2480                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2481                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2482                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2483                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2484                                 out4f[x*4+0] = c[0];
2485                                 out4f[x*4+1] = c[1];
2486                                 out4f[x*4+2] = c[2];
2487                                 out4f[x*4+3] = c[3];
2488                         }
2489                 }
2490         }
2491 }
2492
2493 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2494 {
2495 #ifdef SSE_POSSIBLE
2496         int x;
2497         int startx = span->startx;
2498         int endx = span->endx;
2499         int flags;
2500         __m128 data, slope, tcscale;
2501         __m128i tcsize, tcmask, tcoffset, tcmax;
2502         __m128 tc, endtc;
2503         __m128i subtc, substep, endsubtc;
2504         int filter;
2505         int mip;
2506         int affine; // LordHavoc: optimized affine texturing case
2507         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2508         const unsigned char * RESTRICT pixelbase;
2509         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2510         // if no texture is bound, just fill it with white
2511         if (!texture)
2512         {
2513                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2514                 return;
2515         }
2516         mip = triangle->mip[texunitindex];
2517         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2518         // if this mipmap of the texture is 1 pixel, just fill it with that color
2519         if (texture->mipmap[mip][1] == 4)
2520         {
2521                 unsigned int k = *((const unsigned int *)pixelbase);
2522                 for (x = startx;x < endx;x++)
2523                         outi[x] = k;
2524                 return;
2525         }
2526         affine = zf[startx] == zf[endx-1];
2527         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2528         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2529         flags = texture->flags;
2530         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2531         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2532         tcscale = _mm_cvtepi32_ps(tcsize);
2533         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2534         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2535         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2536         if (filter)
2537                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2538         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2539         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2540         tcmax = _mm_packs_epi32(tcmask, tcmask);
2541         for (x = startx;x < endx;)
2542         {
2543                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2544                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2545                 if (nextsub >= endx || affine)
2546                 {
2547                         nextsub = endsub = endx-1;
2548                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2549                 }       
2550                 tc = endtc;
2551                 subtc = endsubtc;
2552                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2553                 if (filter)
2554                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2555                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2556                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2557                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2558                 substep = _mm_slli_epi32(substep, 1);
2559                 if (filter)
2560                 {
2561                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2562                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2563                         {
2564                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2565                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2566                                 {
2567                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2568                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2569                                         tci = _mm_madd_epi16(tci, tcoffset);
2570                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2571                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2572                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2575                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2576                                         fracm = _mm_srli_epi16(subtc, 1);
2577                                         pix1 = _mm_add_epi16(pix1,
2578                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2580                                         pix3 = _mm_add_epi16(pix3,
2581                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2582                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2583                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2584                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2585                                         pix2 = _mm_add_epi16(pix2,
2586                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2587                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2588                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2589                                 }
2590                                 if (x <= endsub)
2591                                 {
2592                                         const unsigned char * RESTRICT ptr1;
2593                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2594                                         tci = _mm_madd_epi16(tci, tcoffset);
2595                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2596                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2597                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2598                                         fracm = _mm_srli_epi16(subtc, 1);
2599                                         pix1 = _mm_add_epi16(pix1,
2600                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2601                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2602                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2603                                         pix1 = _mm_add_epi16(pix1,
2604                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2605                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2606                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2607                                         x++;
2608                                 }
2609                         }
2610                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2611                         {
2612                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2613                                 {
2614                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2615                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2616                                         tci = _mm_madd_epi16(tci, tcoffset);
2617                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2618                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2619                                                                                         _mm_setzero_si128());
2620                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2621                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2622                                                                                         _mm_setzero_si128());
2623                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2624                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2625                                         tci = _mm_madd_epi16(tci, tcoffset);
2626                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2627                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2628                                                                                         _mm_setzero_si128());
2629                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2630                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2631                                                                                         _mm_setzero_si128());
2632                                         fracm = _mm_srli_epi16(subtc, 1);
2633                                         pix1 = _mm_add_epi16(pix1,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636                                         pix3 = _mm_add_epi16(pix3,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2638                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2639                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2640                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2641                                         pix2 = _mm_add_epi16(pix2,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2643                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2644                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2645                                 }
2646                                 if (x <= endsub)
2647                                 {
2648                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2649                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2650                                         tci = _mm_madd_epi16(tci, tcoffset);
2651                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2652                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2653                                                                                         _mm_setzero_si128());
2654                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2655                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2656                                                                                         _mm_setzero_si128());
2657                                         fracm = _mm_srli_epi16(subtc, 1);
2658                                         pix1 = _mm_add_epi16(pix1,
2659                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2660                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2661                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2662                                         pix1 = _mm_add_epi16(pix1,
2663                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2664                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2665                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2666                                         x++;
2667                                 }
2668                         }
2669                         else
2670                         {
2671                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2672                                 {
2673                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2674                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2675                                         tci = _mm_madd_epi16(tci, tcoffset);
2676                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2677                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2678                                                                                         _mm_setzero_si128());
2679                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2680                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2681                                                                                         _mm_setzero_si128());
2682                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2683                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684                                         tci = _mm_madd_epi16(tci, tcoffset);
2685                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690                                                                                         _mm_setzero_si128());
2691                                         fracm = _mm_srli_epi16(subtc, 1);
2692                                         pix1 = _mm_add_epi16(pix1,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2694                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2695                                         pix3 = _mm_add_epi16(pix3,
2696                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2697                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2698                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2699                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2700                                         pix2 = _mm_add_epi16(pix2,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2702                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2703                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2704                                 }
2705                                 if (x <= endsub)
2706                                 {
2707                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2708                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2709                                         tci = _mm_madd_epi16(tci, tcoffset);
2710                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2711                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2712                                                                                         _mm_setzero_si128());
2713                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2714                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2715                                                                                         _mm_setzero_si128());
2716                                         fracm = _mm_srli_epi16(subtc, 1);
2717                                         pix1 = _mm_add_epi16(pix1,
2718                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2719                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2720                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2721                                         pix1 = _mm_add_epi16(pix1,
2722                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2723                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2724                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2725                                         x++;
2726                                 }
2727                         }
2728                 }
2729                 else
2730                 {
2731                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2732                         {
2733                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2734                                 {
2735                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2736                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2737                                         tci = _mm_madd_epi16(tci, tcoffset);
2738                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2739                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2740                                 }
2741                                 if (x <= endsub)
2742                                 {
2743                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2744                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2745                                         tci = _mm_madd_epi16(tci, tcoffset);
2746                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2747                                         x++;
2748                                 }
2749                         }
2750                         else
2751                         {
2752                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2753                                 {
2754                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2755                                         tci = _mm_and_si128(tci, tcmax); 
2756                                         tci = _mm_madd_epi16(tci, tcoffset);
2757                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2758                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2759                                 }
2760                                 if (x <= endsub)
2761                                 {
2762                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2763                                         tci = _mm_and_si128(tci, tcmax); 
2764                                         tci = _mm_madd_epi16(tci, tcoffset);
2765                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2766                                         x++;
2767                                 }
2768                         }
2769                 }
2770         }
2771 #endif
2772 }
2773
2774 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2775 {
2776         // TODO: IMPLEMENT
2777         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2778 }
2779
// shadowmap sampling stub: the vector argument is never examined and the
// result is always 1
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2785
2786 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2787 {
2788         int x;
2789         int startx = span->startx;
2790         int endx = span->endx;
2791         float c[4];
2792         float data[4];
2793         float slope[4];
2794         float z;
2795         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796         for (x = startx;x < endx;x++)
2797         {
2798                 z = zf[x];
2799                 c[0] = (data[0] + slope[0]*x) * z;
2800                 c[1] = (data[1] + slope[1]*x) * z;
2801                 c[2] = (data[2] + slope[2]*x) * z;
2802                 c[3] = (data[3] + slope[3]*x) * z;
2803                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2804                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2805                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2806                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2807         }
2808 }
2809
2810 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2811 {
2812         int x;
2813         int startx = span->startx;
2814         int endx = span->endx;
2815         float c[4];
2816         float data[4];
2817         float slope[4];
2818         float z;
2819         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2820         for (x = startx;x < endx;x++)
2821         {
2822                 z = zf[x];
2823                 c[0] = (data[0] + slope[0]*x) * z;
2824                 c[1] = (data[1] + slope[1]*x) * z;
2825                 c[2] = (data[2] + slope[2]*x) * z;
2826                 c[3] = (data[3] + slope[3]*x) * z;
2827                 out4f[x*4+0] = c[0];
2828                 out4f[x*4+1] = c[1];
2829                 out4f[x*4+2] = c[2];
2830                 out4f[x*4+3] = c[3];
2831         }
2832 }
2833
2834 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2835 {
2836         int x, startx = span->startx, endx = span->endx;
2837         float c[4], localcolor[4];
2838         localcolor[0] = subcolor[0];
2839         localcolor[1] = subcolor[1];
2840         localcolor[2] = subcolor[2];
2841         localcolor[3] = subcolor[3];
2842         for (x = startx;x < endx;x++)
2843         {
2844                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2845                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2846                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2847                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2848                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2849                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2850                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2851                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2852         }
2853 }
2854
2855 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2856 {
2857         int x, startx = span->startx, endx = span->endx;
2858         for (x = startx;x < endx;x++)
2859         {
2860                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2861                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2862                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2863                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2864         }
2865 }
2866
2867 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2868 {
2869         int x, startx = span->startx, endx = span->endx;
2870         for (x = startx;x < endx;x++)
2871         {
2872                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2873                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2874                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2875                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2876         }
2877 }
2878
2879 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2880 {
2881         int x, startx = span->startx, endx = span->endx;
2882         float a, b;
2883         for (x = startx;x < endx;x++)
2884         {
2885                 a = 1.0f - inb4f[x*4+3];
2886                 b = inb4f[x*4+3];
2887                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2888                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2889                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2890                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2891         }
2892 }
2893
2894 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2895 {
2896         int x, startx = span->startx, endx = span->endx;
2897         float localcolor[4], ilerp, lerp;
2898         localcolor[0] = color[0];
2899         localcolor[1] = color[1];
2900         localcolor[2] = color[2];
2901         localcolor[3] = color[3];
2902         ilerp = 1.0f - localcolor[3];
2903         lerp = localcolor[3];
2904         for (x = startx;x < endx;x++)
2905         {
2906                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2907                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2908                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2909                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2910         }
2911 }
2912
2913
2914
2915 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2916 {
2917 #ifdef SSE_POSSIBLE
2918         int x;
2919         int startx = span->startx;
2920         int endx = span->endx;
2921         __m128 data, slope;
2922         __m128 mod, endmod;
2923         __m128i submod, substep, endsubmod;
2924         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2925         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2926         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2927         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2928         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2929         for (x = startx; x < endx;)
2930         {
2931                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2932                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2933                 if (nextsub >= endx)
2934                 {
2935                         nextsub = endsub = endx-1;
2936                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2937                 }
2938                 mod = endmod;
2939                 submod = endsubmod;
2940                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2941                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2942                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2943                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2944                 substep = _mm_packs_epi32(substep, substep);
2945                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2946                 {
2947                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2948                         pix = _mm_mulhi_epu16(pix, submod);
2949                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2950                 }
2951                 if (x <= endsub)
2952                 {
2953                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2954                         pix = _mm_mulhi_epu16(pix, submod);
2955                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2956                         x++;
2957                 }
2958         }
2959 #endif
2960 }
2961
2962 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2963 {
2964 #ifdef SSE_POSSIBLE
2965         int x;
2966         int startx = span->startx;
2967         int endx = span->endx;
2968         __m128 data, slope;
2969         __m128 mod, endmod;
2970         __m128i submod, substep, endsubmod;
2971         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2972         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2973         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2974         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2975         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2976         for (x = startx; x < endx;)
2977         {
2978                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2979                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2980                 if (nextsub >= endx)
2981                 {
2982                         nextsub = endsub = endx-1;
2983                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2984                 }
2985                 mod = endmod;
2986                 submod = endsubmod;
2987                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2988                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2989                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2990                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2991                 substep = _mm_packs_epi32(substep, substep);
2992                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2993                 {
2994                         __m128i pix = _mm_srai_epi16(submod, 4);
2995                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2996                 }
2997                 if (x <= endsub)
2998                 {
2999                         __m128i pix = _mm_srai_epi16(submod, 4);
3000                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3001                         x++;
3002                 }
3003         }
3004 #endif
3005 }
3006
3007 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3008 {
3009 #ifdef SSE_POSSIBLE
3010         int x, startx = span->startx, endx = span->endx;
3011         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3012         localcolor = _mm_packs_epi32(localcolor, localcolor);
3013         for (x = startx;x+2 <= endx;x+=2)
3014         {
3015                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3016                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3017                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3018                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3019         }
3020         if (x < endx)
3021         {
3022                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3023                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3024                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3025                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3026         }
3027 #endif
3028 }
3029
3030 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3031 {
3032 #ifdef SSE_POSSIBLE
3033         int x, startx = span->startx, endx = span->endx;
3034         for (x = startx;x+2 <= endx;x+=2)
3035         {
3036                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3037                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3038                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3039                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3040         }
3041         if (x < endx)
3042         {
3043                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3044                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3045                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3046                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3047         }
3048 #endif
3049 }
3050
3051 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3052 {
3053 #ifdef SSE_POSSIBLE
3054         int x, startx = span->startx, endx = span->endx;
3055         for (x = startx;x+2 <= endx;x+=2)
3056         {
3057                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3058                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3059                 pix1 = _mm_add_epi16(pix1, pix2);
3060                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3061         }
3062         if (x < endx)
3063         {
3064                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3065                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3066                 pix1 = _mm_add_epi16(pix1, pix2);
3067                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3068         }
3069 #endif
3070 }
3071
3072 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3073 {
3074 #ifdef SSE_POSSIBLE
3075         int x, startx = span->startx, endx = span->endx;
3076         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3077         tint = _mm_packs_epi32(tint, tint);
3078         for (x = startx;x+2 <= endx;x+=2)
3079         {
3080                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3081                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3082                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3083                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3084         }
3085         if (x < endx)
3086         {
3087                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3088                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3089                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3090                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3091         }
3092 #endif
3093 }
3094
3095 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3096 {
3097 #ifdef SSE_POSSIBLE
3098         int x, startx = span->startx, endx = span->endx;
3099         for (x = startx;x+2 <= endx;x+=2)
3100         {
3101                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3102                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3103                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3104                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3105                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3106         }
3107         if (x < endx)
3108         {
3109                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3110                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3111                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3112                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3113                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3114         }
3115 #endif
3116 }
3117
3118 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3119 {
3120 #ifdef SSE_POSSIBLE
3121         int x, startx = span->startx, endx = span->endx;
3122         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3123         localcolor = _mm_packs_epi32(localcolor, localcolor);
3124         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3125         for (x = startx;x+2 <= endx;x+=2)
3126         {
3127                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3128                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3129                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3130         }
3131         if (x < endx)
3132         {
3133                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3134                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3135                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3136         }
3137 #endif
3138 }
3139
3140
3141
3142 void DPSOFTRAST_VertexShader_Generic(void)
3143 {
3144         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3145         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3146         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3147         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3148                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3149 }
3150
3151 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3152 {
3153         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3154         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3155         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3156         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3157         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3158         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3159         {
3160                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3161                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3162                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3163                 {
3164                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3165                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3166                         {
3167                                 // multiply
3168                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3169                         }
3170                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3171                         {
3172                                 // add
3173                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3174                         }
3175                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3176                         {
3177                                 // alphablend
3178                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3179                         }
3180                 }
3181         }
3182         else
3183                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3184         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3185 }
3186
3187
3188
3189 void DPSOFTRAST_VertexShader_PostProcess(void)
3190 {
3191         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3192         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3193         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3194 }
3195
3196 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3197 {
3198         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3199         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3200         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3201         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3202         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3203         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3204         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3205         {
3206                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3207                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3208         }
3209         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3210         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3211         {
3212                 // TODO: implement saturation
3213         }
3214         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3215         {
3216                 // TODO: implement gammaramps
3217         }
3218         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3219 }
3220
3221
3222
3223 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3224 {
3225         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226 }
3227
3228 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3229 {
3230         // this is never called (because colormask is off when this shader is used)
3231         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3232         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3234         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3235         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3236 }
3237
3238
3239
3240 void DPSOFTRAST_VertexShader_FlatColor(void)
3241 {
3242         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3243         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3244 }
3245
3246 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3247 {
3248 #ifdef SSE_POSSIBLE
3249         unsigned char * RESTRICT pixelmask = span->pixelmask;
3250         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3251         int x, startx = span->startx, endx = span->endx;
3252         __m128i Color_Ambientm;
3253         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3254         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3255         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3256         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3257         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3258         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3259                 pixel = buffer_FragColorbgra8;
3260         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3261         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3262         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3263         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3264         for (x = startx;x < endx;x++)
3265         {
3266                 __m128i color, pix;
3267                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3268                 {
3269                         __m128i pix2;
3270                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3271                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3272                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3273                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3274                         x += 3;
3275                         continue;
3276                 }
3277                 if (!pixelmask[x])
3278                         continue;
3279                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3280                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3281                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3282         }
3283         if (pixel == buffer_FragColorbgra8)
3284                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3285 #endif
3286 }
3287
3288
3289
3290 void DPSOFTRAST_VertexShader_VertexColor(void)
3291 {
3292         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3293         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3294         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3295 }
3296
3297 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3298 {
3299 #ifdef SSE_POSSIBLE
3300         unsigned char * RESTRICT pixelmask = span->pixelmask;
3301         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3302         int x, startx = span->startx, endx = span->endx;
3303         __m128i Color_Ambientm, Color_Diffusem;
3304         __m128 data, slope;
3305         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3306         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3307         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3308         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3309         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3310         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3311         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3312                 pixel = buffer_FragColorbgra8;
3313         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3314         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3315         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3316         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3317         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3318         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3319         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3320         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3321         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3322         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3323         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3324         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3325         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3326         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3327         {
3328                 __m128i color, mod, pix;
3329                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3330                 {
3331                         __m128i pix2, mod2;
3332                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3333                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3334                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3335                         data = _mm_add_ps(data, slope);
3336                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3337                         data = _mm_add_ps(data, slope);
3338                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3339                         data = _mm_add_ps(data, slope);
3340                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3341                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3342                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3343                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3344                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3345                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3346                         x += 3;
3347                         continue;
3348                 }
3349                 if (!pixelmask[x])
3350                         continue;
3351                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3352                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3353                 mod = _mm_packs_epi32(mod, mod);
3354                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3355                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3356         }
3357         if (pixel == buffer_FragColorbgra8)
3358                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3359 #endif
3360 }
3361
3362
3363
3364 void DPSOFTRAST_VertexShader_Lightmap(void)
3365 {
3366         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3367         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3368         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3369 }
3370
3371 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3372 {
3373 #ifdef SSE_POSSIBLE
3374         unsigned char * RESTRICT pixelmask = span->pixelmask;
3375         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3376         int x, startx = span->startx, endx = span->endx;
3377         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3378         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3379         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3383         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3384         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3385         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3386         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3387                 pixel = buffer_FragColorbgra8;
3388         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3389         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3390         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3391         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3392         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3393         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3394         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3395         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3396         {
3397                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3398                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3399                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3400                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3401                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3402                 for (x = startx;x < endx;x++)
3403                 {
3404                         __m128i color, lightmap, glow, pix;
3405                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3406                         {
3407                                 __m128i pix2;
3408                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3409                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3410                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3411                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3412                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3413                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3414                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3415                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3416                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3417                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3418                                 x += 3;
3419                                 continue;
3420                         }
3421                         if (!pixelmask[x])
3422                                 continue;
3423                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3424                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3425                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3426                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3427                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3428                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3429                 }
3430         }
3431         else
3432         {
3433                 for (x = startx;x < endx;x++)
3434                 {
3435                         __m128i color, lightmap, pix;
3436                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3437                         {
3438                                 __m128i pix2;
3439                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3440                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3441                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3442                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3443                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3444                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3445                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3446                                 x += 3;
3447                                 continue;
3448                         }
3449                         if (!pixelmask[x]) 
3450                                 continue;
3451                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3452                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3453                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3454                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3455                 }
3456         }
3457         if (pixel == buffer_FragColorbgra8)
3458                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3459 #endif
3460 }
3461
3462
3463 void DPSOFTRAST_VertexShader_LightDirection(void);
3464 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3465
// Vertex stage for the FakeLight shader mode.
// It needs nothing beyond what the LightDirection vertex stage already
// produces, so it simply forwards to it.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3470
3471 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3472 {
3473         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3474 }
3475
3476
3477
3478 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3479 {
3480         DPSOFTRAST_VertexShader_LightDirection();
3481         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3482 }
3483
3484 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3485 {
3486         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3487 }
3488
3489
3490
3491 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3492 {
3493         DPSOFTRAST_VertexShader_LightDirection();
3494         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3495 }
3496
3497 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3498 {
3499         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3500 }
3501
3502
3503
3504 void DPSOFTRAST_VertexShader_LightDirection(void)
3505 {
3506         int i;
3507         int numvertices = dpsoftrast.numvertices;
3508         float LightDir[4];
3509         float LightVector[4];
3510         float EyePosition[4];
3511         float EyeVectorModelSpace[4];
3512         float EyeVector[4];
3513         float position[4];
3514         float svector[4];
3515         float tvector[4];
3516         float normal[4];
3517         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3518         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3519         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3520         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3521         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3522         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3523         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3524         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3525         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3526         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3527         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3528         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3529         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3530         for (i = 0;i < numvertices;i++)
3531         {
3532                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3533                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3534                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3535                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3536                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3537                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3538                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3539                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3540                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3541                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3542                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3543                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3544                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3545                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3546                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3547                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3548                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3549                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3550                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3551                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3552                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3553                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3554                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3555                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3556                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3557                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3558                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3559                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3560                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3561         }
3562         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3563 }
3564
// Small scalar / 3-component vector helpers used by the software shaders.
// These are macros: every argument may be evaluated more than once, so never
// pass an expression with side effects (x++, function calls that mutate, ...).

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length and length of the first three elements of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three elements of v in place so the vector has unit
// length; a zero-length vector is left untouched (no division by zero).
// v is parenthesized so pointer expressions such as (base + 4) expand
// correctly, and the temporary has a long prefixed name so it cannot capture
// a caller variable (the previous name, len, would shadow any argument that
// mentioned len).
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len_ = (float)sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len_)\
	{\
		dpsoftrast_normalize_len_ = 1.0f / dpsoftrast_normalize_len_;\
		(v)[0] *= dpsoftrast_normalize_len_;\
		(v)[1] *= dpsoftrast_normalize_len_;\
		(v)[2] *= dpsoftrast_normalize_len_;\
	}\
}\
while(0)
3583
3584 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3585 {
3586         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3587         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3588         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3589         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3590         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3591         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3592         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3593         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3594         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3595         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3596         int x, startx = span->startx, endx = span->endx;
3597         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3598         float LightVectordata[4];
3599         float LightVectorslope[4];
3600         float EyeVectordata[4];
3601         float EyeVectorslope[4];
3602         float VectorSdata[4];
3603         float VectorSslope[4];
3604         float VectorTdata[4];
3605         float VectorTslope[4];
3606         float VectorRdata[4];
3607         float VectorRslope[4];
3608         float z;
3609         float diffusetex[4];
3610         float glosstex[4];
3611         float surfacenormal[4];
3612         float lightnormal[4];
3613         float lightnormal_modelspace[4];
3614         float eyenormal[4];
3615         float specularnormal[4];
3616         float diffuse;
3617         float specular;
3618         float SpecularPower;
3619         int d[4];
3620         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3621         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3622         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3623         Color_Glow[3] = 0.0f;
3624         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3625         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3626         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3627         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3628         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3629         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3630         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3631         Color_Pants[3] = 0.0f;
3632         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3633         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3634         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3635         Color_Shirt[3] = 0.0f;
3636         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3637         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3638         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3639         {
3640                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3641                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3642         }
3643         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3644         {
3645                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3646         }
3647         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3648         {
3649                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3650                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3651                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3652                 Color_Diffuse[3] = 0.0f;
3653                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3654                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3655                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3656                 LightColor[3] = 0.0f;
3657                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3658                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3659                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3660                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3661                 Color_Specular[3] = 0.0f;
3662                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3663                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3664                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3665
3666                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3667                 {
3668                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3669                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3670                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3671                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3672                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3673                 }
3674                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3675                 {
3676                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3677                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3678                 }
3679                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3680                 {
3681                         // nothing of this needed
3682                 }
3683                 else
3684                 {
3685                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3686                 }
3687
3688                 for (x = startx;x < endx;x++)
3689                 {
3690                         z = buffer_z[x];
3691                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3692                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3693                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3694                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3695                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3696                         {
3697                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3698                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3699                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3700                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3701                         }
3702                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3703                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3704                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3705                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3706                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3707                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3708                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3709                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3710
3711                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3712                         {
3713                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3714                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3715                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3716                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3717
3718                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3719                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3720                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3721                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3722
3723                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3724                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3725                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3726                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3727
3728                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3729                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3730                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3731                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3732
3733                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3734                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3735
3736                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3737                                 {
3738                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3739                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3740                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3741                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3742                                 }
3743                         }
3744                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3745                         {
3746                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3747                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3748                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3749                                 {
3750                                         float f = 1.0f / 256.0f;
3751                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3752                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3753                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3754                                 }
3755                         }
3756                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3757                         {
3758                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3759                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3760                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3761                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3762
3763                                 LightColor[0] = 1.0;
3764                                 LightColor[1] = 1.0;
3765                                 LightColor[2] = 1.0;
3766                         }
3767                         else
3768                         {
3769                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3770                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3771                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3772                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3773                         }
3774
3775                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3776
3777                         if(thread->shader_exactspecularmath)
3778                         {
3779                                 // reflect lightnormal at surfacenormal, take the negative of that
3780                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3781                                 float f;
3782                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3783                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3784                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3785                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3786
3787                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3788                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3789                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3790                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3791                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3792
3793                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3794                         }
3795                         else
3796                         {
3797                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3798                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3799                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3800                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3801
3802                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3803                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3804                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3805                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3806
3807                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3808                         }
3809
3810                         specular = pow(specular, SpecularPower * glosstex[3]);
3811                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3812                         {
3813                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3814                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3815                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3816                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3817                         }
3818                         else
3819                         {
3820                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3821                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3822                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3823                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3824                         }
3825
3826                         buffer_FragColorbgra8[x*4+0] = d[0];
3827                         buffer_FragColorbgra8[x*4+1] = d[1];
3828                         buffer_FragColorbgra8[x*4+2] = d[2];
3829                         buffer_FragColorbgra8[x*4+3] = d[3];
3830                 }
3831         }
3832         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3833         {
3834                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3835                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3836                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3837                 Color_Diffuse[3] = 0.0f;
3838                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3839                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3840                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3841                 LightColor[3] = 0.0f;
3842                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3843
3844                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3845                 {
3846                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3847                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3848                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3849                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3850                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3851                 }
3852                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3853                 {
3854                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3855                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3856                 }
3857                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3858                 {
3859                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3860                 }
3861                 else
3862                 {
3863                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3864                 }
3865
3866                 for (x = startx;x < endx;x++)
3867                 {
3868                         z = buffer_z[x];
3869                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3870                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3871                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3872                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3873                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3874                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3875                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3876                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3877
3878                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3879                         {
3880                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3881                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3882                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3883                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3884
3885                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3886                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3887                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3888                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3889
3890                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3891                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3892                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3893                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3894
3895                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3896                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3897                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3898                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3899
3900                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3901                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3902
3903                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3904                                 {
3905                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3906                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3907                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3908                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3909                                 }
3910                         }
3911                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3912                         {
3913                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3914                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3915                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3916                                 {
3917                                         float f = 1.0f / 256.0f;
3918                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3919                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3920                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3921                                 }
3922                         }
3923                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3924                         {
3925                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3926                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3927                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3928                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3929
3930                                 LightColor[0] = 1.0;
3931                                 LightColor[1] = 1.0;
3932                                 LightColor[2] = 1.0;
3933                         }
3934                         else
3935                         {
3936                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3937                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3938                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3939                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3940                         }
3941
3942                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3943                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3944                         {
3945                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3946                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3947                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3948                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3949                         }
3950                         else
3951                         {
3952                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3953                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3954                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3955                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3956                         }
3957                         buffer_FragColorbgra8[x*4+0] = d[0];
3958                         buffer_FragColorbgra8[x*4+1] = d[1];
3959                         buffer_FragColorbgra8[x*4+2] = d[2];
3960                         buffer_FragColorbgra8[x*4+3] = d[3];
3961                 }
3962         }
3963         else
3964         {
3965                 for (x = startx;x < endx;x++)
3966                 {
3967                         z = buffer_z[x];
3968                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3969                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3970                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3971                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3972
3973                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3974                         {
3975                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3976                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3977                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3978                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3979                         }
3980                         else
3981                         {
3982                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3983                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3984                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3985                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3986                         }
3987                         buffer_FragColorbgra8[x*4+0] = d[0];
3988                         buffer_FragColorbgra8[x*4+1] = d[1];
3989                         buffer_FragColorbgra8[x*4+2] = d[2];
3990                         buffer_FragColorbgra8[x*4+3] = d[3];
3991                 }
3992         }
3993         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3994 }
3995
3996
3997
3998 void DPSOFTRAST_VertexShader_LightSource(void)
3999 {
4000         int i;
4001         int numvertices = dpsoftrast.numvertices;
4002         float LightPosition[4];
4003         float LightVector[4];
4004         float LightVectorModelSpace[4];
4005         float EyePosition[4];
4006         float EyeVectorModelSpace[4];
4007         float EyeVector[4];
4008         float position[4];
4009         float svector[4];
4010         float tvector[4];
4011         float normal[4];
4012         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4013         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4014         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4015         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4016         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4017         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4018         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4019         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4020         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4021         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4022         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4023         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4024         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4025         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4026         for (i = 0;i < numvertices;i++)
4027         {
4028                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4029                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4030                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4031                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4032                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4033                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4034                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4035                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4036                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4037                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4038                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4039                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4040                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4041                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4042                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4043                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4044                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4045                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4046                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4047                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4048                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4049                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4050                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4051                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4052                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4053                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4054                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4055                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4056                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4057                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4058                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4059                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4060         }
4061         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4062         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4063 }
4064
4065 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4066 {
4067 #ifdef SSE_POSSIBLE
4068         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4069         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4070         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4071         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4072         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4073         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4074         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4075         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4076         int x, startx = span->startx, endx = span->endx;
4077         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4078         float CubeVectordata[4];
4079         float CubeVectorslope[4];
4080         float LightVectordata[4];
4081         float LightVectorslope[4];
4082         float EyeVectordata[4];
4083         float EyeVectorslope[4];
4084         float z;
4085         float diffusetex[4];
4086         float glosstex[4];
4087         float surfacenormal[4];
4088         float lightnormal[4];
4089         float eyenormal[4];
4090         float specularnormal[4];
4091         float diffuse;
4092         float specular;
4093         float SpecularPower;
4094         float CubeVector[4];
4095         float attenuation;
4096         int d[4];
4097         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4098         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4099         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4100         Color_Glow[3] = 0.0f;
4101         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4102         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4103         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4104         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4105         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4106         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4107         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4108         Color_Diffuse[3] = 0.0f;
4109         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4110         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4111         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4112         Color_Specular[3] = 0.0f;
4113         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4114         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4115         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4116         Color_Pants[3] = 0.0f;
4117         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4118         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4119         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4120         Color_Shirt[3] = 0.0f;
4121         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4122         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4123         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4124         LightColor[3] = 0.0f;
4125         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4126         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4127         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4128         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4129         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4130         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4131         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4132         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4133         {
4134                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4135                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4136         }
4137         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4138                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4139         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4140         {
4141                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4142                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4143                 for (x = startx;x < endx;x++)
4144                 {
4145                         z = buffer_z[x];
4146                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4147                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4148                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4149                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4150                         if (attenuation < 0.01f)
4151                                 continue;
4152                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4153                         {
4154                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4155                                 if (attenuation < 0.01f)
4156                                         continue;
4157                         }
4158
4159                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4160                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4161                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4162                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4163                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4164                         {
4165                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4166                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4167                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4168                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4169                         }
4170                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4171                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4172                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4173                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4174                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4175                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4176                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4177                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4178
4179                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4180                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4181                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4182                         DPSOFTRAST_Vector3Normalize(lightnormal);
4183
4184                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4185
4186                         if(thread->shader_exactspecularmath)
4187                         {
4188                                 // reflect lightnormal at surfacenormal, take the negative of that
4189                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4190                                 float f;
4191                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4192                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4193                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4194                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4195
4196                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4197                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4198                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4199                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4200                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4201
4202                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4203                         }
4204                         else
4205                         {
4206                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4207                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4208                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4209                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4210
4211                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4212                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4213                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4214                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4215
4216                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4217                         }
4218                         specular = pow(specular, SpecularPower * glosstex[3]);
4219
4220                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4221                         {
4222                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4223                                 attenuation *= (1.0f / 255.0f);
4224                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4225                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4226                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4227                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4228                         }
4229                         else
4230                         {
4231                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4232                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4233                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4234                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4235                         }
4236                         buffer_FragColorbgra8[x*4+0] = d[0];
4237                         buffer_FragColorbgra8[x*4+1] = d[1];
4238                         buffer_FragColorbgra8[x*4+2] = d[2];
4239                         buffer_FragColorbgra8[x*4+3] = d[3];
4240                 }
4241         }
4242         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4243         {
4244                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4245                 for (x = startx;x < endx;x++)
4246                 {
4247                         z = buffer_z[x];
4248                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4249                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4250                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4251                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4252                         if (attenuation < 0.01f)
4253                                 continue;
4254                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4255                         {
4256                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4257                                 if (attenuation < 0.01f)
4258                                         continue;
4259                         }
4260
4261                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4262                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4263                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4264                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4265                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4266                         {
4267                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4268                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4269                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4270                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4271                         }
4272                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4273                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4274                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4275                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4276
4277                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4278                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4279                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4280                         DPSOFTRAST_Vector3Normalize(lightnormal);
4281
4282                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4283                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4284                         {
4285                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4286                                 attenuation *= (1.0f / 255.0f);
4287                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4288                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4289                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4290                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4291                         }
4292                         else
4293                         {
4294                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4295                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4296                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4297                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4298                         }
4299                         buffer_FragColorbgra8[x*4+0] = d[0];
4300                         buffer_FragColorbgra8[x*4+1] = d[1];
4301                         buffer_FragColorbgra8[x*4+2] = d[2];
4302                         buffer_FragColorbgra8[x*4+3] = d[3];
4303                 }
4304         }
4305         else
4306         {
4307                 for (x = startx;x < endx;x++)
4308                 {
4309                         z = buffer_z[x];
4310                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4311                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4312                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4313                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4314                         if (attenuation < 0.01f)
4315                                 continue;
4316                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4317                         {
4318                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4319                                 if (attenuation < 0.01f)
4320                                         continue;
4321                         }
4322
4323                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4324                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4325                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4326                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4327                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4328                         {
4329                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4330                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4331                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4332                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4333                         }
4334                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4335                         {
4336                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4337                                 attenuation *= (1.0f / 255.0f);
4338                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4339                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4340                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4341                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4342                         }
4343                         else
4344                         {
4345                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4346                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4347                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4348                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4349                         }
4350                         buffer_FragColorbgra8[x*4+0] = d[0];
4351                         buffer_FragColorbgra8[x*4+1] = d[1];
4352                         buffer_FragColorbgra8[x*4+2] = d[2];
4353                         buffer_FragColorbgra8[x*4+3] = d[3];
4354                 }
4355         }
4356         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4357 #endif
4358 }
4359
4360
4361
4362 void DPSOFTRAST_VertexShader_Refraction(void)
4363 {
4364         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4365         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4366         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4367 }
4368
4369 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4370 {
4371         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4372
4373         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4374         float z;
4375         int x, startx = span->startx, endx = span->endx;
4376
4377         // texture reads
4378         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4379         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4380
4381         // varyings
4382         float ModelViewProjectionPositiondata[4];
4383         float ModelViewProjectionPositionslope[4];
4384
4385         // uniforms
4386         float ScreenScaleRefractReflect[2];
4387         float ScreenCenterRefractReflect[2];
4388         float DistortScaleRefractReflect[2];
4389         float RefractColor[4];
4390
4391         const unsigned char * RESTRICT pixelbase;
4392         const unsigned char * RESTRICT pixel[4];
4393         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4394         if(!texture) return;
4395         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4396
4397         // read textures
4398         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4399         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4400
4401         // read varyings
4402         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4403
4404         // read uniforms
4405         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4406         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4407         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4408         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4409         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4410         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4411         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4412         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4413         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4414         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4415
4416         // do stuff
4417         for (x = startx;x < endx;x++)
4418         {
4419                 float SafeScreenTexCoord[2];
4420                 float ScreenTexCoord[2];
4421                 float v[3];
4422                 float iw;
4423                 unsigned char c[4];
4424
4425                 z = buffer_z[x];
4426
4427                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4428                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4429                 
4430                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4431                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4432                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4433
4434                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4435                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4436                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4437                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4438                 DPSOFTRAST_Vector3Normalize(v);
4439                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4440                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4441
4442                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4443                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4444                 {
4445                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4446                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4447                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4448                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4449                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4450                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4451                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4452                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4453                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4454                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4455                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4456                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4457                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4458                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4459                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4460                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4461                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4462                 }
4463                 else
4464                 {
4465                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4466                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4467                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4468                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4469                         c[0] = pixel[0][0];
4470                         c[1] = pixel[0][1];
4471                         c[2] = pixel[0][2];
4472                 }
4473
4474                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4475                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4476                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4477                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4478                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4479         }
4480
4481         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4482 }
4483
4484
4485
4486 void DPSOFTRAST_VertexShader_Water(void)
4487 {
4488         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4489 }
4490
4491
4492 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4493 {
4494         // TODO: IMPLEMENT
4495         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4496         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4497         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4498         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4499         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4500 }
4501
4502
4503
4504 void DPSOFTRAST_VertexShader_ShowDepth(void)
4505 {
4506         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4507 }
4508
4509 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4510 {
4511         // TODO: IMPLEMENT
4512         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4513         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4514         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4515         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4516         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4517 }
4518
4519
4520
4521 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4522 {
4523         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4524 }
4525
4526 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4527 {
4528         // TODO: IMPLEMENT
4529         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4530         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4531         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4532         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4533         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4534 }
4535
4536
4537
4538 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4539 {
4540         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4541 }
4542
4543 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4544 {
4545         // TODO: IMPLEMENT
4546         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4547         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4548         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4549         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4550         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4551 }
4552
4553
4554
4555 typedef struct DPSOFTRAST_ShaderModeInfo_s
4556 {
4557         int lodarrayindex;
4558         void (*Vertex)(void);
4559         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4560         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4561         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4562 }
4563 DPSOFTRAST_ShaderModeInfo;
4564
4565 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4566 {
4567         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4568         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4569         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4570         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4571         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4572         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4573         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4574         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4575         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4576         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4577         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4578         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4579         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4580         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4581         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4582         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4583 };
4584
4585 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4586 {
4587         int x;
4588         int startx;
4589         int endx;
4590         unsigned int *depthpixel;
4591         int depth;
4592         int depthslope;
4593         unsigned int d;
4594         unsigned char *pixelmask;
4595         DPSOFTRAST_State_Triangle *triangle;
4596         triangle = &thread->triangles[span->triangle];
4597         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4598         startx = span->startx;
4599         endx = span->endx;
4600         depth = span->depthbase;
4601         depthslope = span->depthslope;
4602         pixelmask = thread->pixelmaskarray;
4603         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4604         {
4605                 switch(thread->fb_depthfunc)
4606                 {
4607                 default:
4608                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4609                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4610                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4611                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4612                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4613                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4614                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4615                 }
4616                 while (startx < endx && !pixelmask[startx])
4617                         startx++;
4618                 while (endx > startx && !pixelmask[endx-1])
4619                         endx--;
4620         }
4621         else
4622         {
4623                 // no depth testing means we're just dealing with color...
4624                 memset(pixelmask + startx, 1, endx - startx);
4625         }
4626         span->pixelmask = pixelmask;
4627         span->startx = startx;
4628         span->endx = endx;
4629 }
4630
4631 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4632 {
4633         int x, d, depth, depthslope, startx, endx;
4634         const unsigned char *pixelmask;
4635         unsigned int *depthpixel;
4636         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4637         {
4638                 depth = span->depthbase;
4639                 depthslope = span->depthslope;
4640                 pixelmask = span->pixelmask;
4641                 startx = span->startx;
4642                 endx = span->endx;
4643                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4644                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4645                         if (pixelmask[x])
4646                                 depthpixel[x] = d;
4647         }
4648 }
4649
4650 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4651 {
4652         int i;
4653         DPSOFTRAST_State_Triangle *triangle;
4654         DPSOFTRAST_State_Span *span;
4655         for (i = 0; i < thread->numspans; i++)
4656         {
4657                 span = &thread->spans[i];
4658                 triangle = &thread->triangles[span->triangle];
4659                 DPSOFTRAST_Draw_DepthTest(thread, span);
4660                 if (span->startx >= span->endx)
4661                         continue;
4662                 // run pixel shader if appropriate
4663                 // do this before running depthmask code, to allow the pixelshader
4664                 // to clear pixelmask values for alpha testing
4665                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4666                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4667                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4668         }
4669         thread->numspans = 0;
4670 }
4671
4672 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4673
4674 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4675 {
4676 #ifdef SSE_POSSIBLE
4677         int cullface = thread->cullface;
4678         int minx, maxx, miny, maxy;
4679         int miny1, maxy1, miny2, maxy2;
4680         __m128i fbmin, fbmax;
4681         __m128 viewportcenter, viewportscale;
4682         int firstvertex = command->firstvertex;
4683         int numvertices = command->numvertices;
4684         int numtriangles = command->numtriangles;
4685         const int *element3i = command->element3i;
4686         const unsigned short *element3s = command->element3s;
4687         int clipped = command->clipped;
4688         int i;
4689         int j;
4690         int k;
4691         int y;
4692         int e[3];
4693         __m128i screeny;
4694         int starty, endy, bandy;
4695         int numpoints;
4696         int clipcase;
4697         float clipdist[4];
4698         float clip0origin, clip0slope;
4699         int clip0dir;
4700         __m128 triangleedge1, triangleedge2, trianglenormal;
4701         __m128 clipfrac[3];
4702         __m128 screen[4];
4703         DPSOFTRAST_State_Triangle *triangle;
4704         DPSOFTRAST_Texture *texture;
4705         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4706         miny = thread->fb_scissor[1];
4707         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4708         miny1 = bound(miny, thread->miny1, maxy);
4709         maxy1 = bound(miny, thread->maxy1, maxy);
4710         miny2 = bound(miny, thread->miny2, maxy);
4711         maxy2 = bound(miny, thread->maxy2, maxy);
4712         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4713         {
4714                 if (!ATOMIC_DECREMENT(command->refcount))
4715                 {
4716                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4717                                 MM_FREE(command->arrays);
4718                 }
4719                 return;
4720         }
4721         minx = thread->fb_scissor[0];
4722         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4723         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4724         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4725         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4726         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4727         screen[3] = _mm_setzero_ps();
4728         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4729         for (i = 0;i < numtriangles;i++)
4730         {
4731                 const float *screencoord4f = command->arrays;
4732                 const float *arrays = screencoord4f + numvertices*4;
4733
4734                 // generate the 3 edges of this triangle
4735                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4736                 if (element3s)
4737                 {
4738                         e[0] = element3s[i*3+0] - firstvertex;
4739                         e[1] = element3s[i*3+1] - firstvertex;
4740                         e[2] = element3s[i*3+2] - firstvertex;
4741                 }
4742                 else if (element3i)
4743                 {
4744                         e[0] = element3i[i*3+0] - firstvertex;
4745                         e[1] = element3i[i*3+1] - firstvertex;
4746                         e[2] = element3i[i*3+2] - firstvertex;
4747                 }
4748                 else
4749                 {
4750                         e[0] = i*3+0;
4751                         e[1] = i*3+1;
4752                         e[2] = i*3+2;
4753                 }
4754
4755 #define SKIPBACKFACE \
4756                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4757                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4758                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4759                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4760                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4761                 switch(cullface) \
4762                 { \
4763                 case GL_BACK: \
4764                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4765                                 continue; \
4766                         break; \
4767                 case GL_FRONT: \
4768                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4769                                 continue; \
4770                         break; \
4771                 }
4772
4773 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4774                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4775                         { \
4776                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4777                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4778                         }
4779 #define CLIPPEDVERTEXCOPY(k,p1) \
4780                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4781
4782 #define GENATTRIBCOPY(attrib, p1) \
4783                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4784 #define GENATTRIBLERP(attrib, p1, p2) \
4785                 { \
4786                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4787                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4788                 }
4789 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4790                 switch(clipcase) \
4791                 { \
4792                 default: \
4793                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4794                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4795                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4796                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4797                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4798                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4799                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4800                 }
4801
4802                 if (! clipped)
4803                         goto notclipped;
4804
4805                 // calculate distance from nearplane
4806                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4807                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4808                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4809                 if (clipdist[0] >= 0.0f)
4810                 {
4811                         if (clipdist[1] >= 0.0f)
4812                         {
4813                                 if (clipdist[2] >= 0.0f)
4814                                 {
4815                                 notclipped:
4816                                         // triangle is entirely in front of nearplane
4817                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4818                                         SKIPBACKFACE;
4819                                         numpoints = 3;
4820                                         clipcase = 0;
4821                                 }
4822                                 else
4823                                 {
4824                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4825                                         SKIPBACKFACE;
4826                                         numpoints = 4;
4827                                         clipcase = 1;
4828                                 }
4829                         }
4830                         else
4831                         {
4832                                 if (clipdist[2] >= 0.0f)
4833                                 {
4834                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4835                                         SKIPBACKFACE;
4836                                         numpoints = 4;
4837                                         clipcase = 2;
4838                                 }
4839                                 else
4840                                 {
4841                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4842                                         SKIPBACKFACE;
4843                                         numpoints = 3;
4844                                         clipcase = 3;
4845                                 }
4846                         }
4847                 }
4848                 else if (clipdist[1] >= 0.0f)
4849                 {
4850                         if (clipdist[2] >= 0.0f)
4851                         {
4852                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4853                                 SKIPBACKFACE;
4854                                 numpoints = 4;
4855                                 clipcase = 4;
4856                         }
4857                         else
4858                         {
4859                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4860                                 SKIPBACKFACE;
4861                                 numpoints = 3;
4862                                 clipcase = 5;
4863                         }
4864                 }
4865                 else if (clipdist[2] >= 0.0f)
4866                 {
4867                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4868                         SKIPBACKFACE;
4869                         numpoints = 3;
4870                         clipcase = 6;
4871                 }
4872                 else continue; // triangle is entirely behind nearplane
4873
4874                 {
4875                         // calculate integer y coords for triangle points
4876                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4877                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4878                                         screenmin = _mm_min_epi16(screeni, screenir),
4879                                         screenmax = _mm_max_epi16(screeni, screenir);
4880                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4881                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4882                         screenmin = _mm_max_epi16(screenmin, fbmin);
4883                         screenmax = _mm_min_epi16(screenmax, fbmax);
4884                         // skip offscreen triangles
4885                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4886                                 continue;
4887                         starty = _mm_extract_epi16(screenmin, 1);
4888                         endy = _mm_extract_epi16(screenmax, 1)+1;
4889                         if (starty >= maxy1 && endy <= miny2)
4890                                 continue;
4891                         screeny = _mm_srai_epi32(screeni, 16);
4892                 }
4893
4894                 triangle = &thread->triangles[thread->numtriangles];
4895
4896                 // calculate attribute plans for triangle data...
4897                 // okay, this triangle is going to produce spans, we'd better project
4898                 // the interpolants now (this is what gives perspective texturing),
4899                 // this consists of simply multiplying all arrays by the W coord
4900                 // (which is basically 1/Z), which will be undone per-pixel
4901                 // (multiplying by Z again) to get the perspective-correct array
4902                 // values
4903                 {
4904                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4905                         __m128 mipedgescale, mipdensity;
4906                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4907                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4908                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4909                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4910                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4911                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4912                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4913                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4914                         attribedge1 = _mm_sub_ss(w0, w1);
4915                         attribedge2 = _mm_sub_ss(w2, w1);
4916                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4917                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4918                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4919                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4920                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4921                         _mm_store_ss(&triangle->w[0], attribxslope);
4922                         _mm_store_ss(&triangle->w[1], attribyslope);
4923                         _mm_store_ss(&triangle->w[2], attriborigin);
4924                         
4925                         clip0origin = 0;
4926                         clip0slope = 0;
4927                         clip0dir = 0;
4928                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4929                         {
4930                                 float cliporigin, clipxslope, clipyslope;
4931                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4932                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4933                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4934                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4935                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4936                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4937                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4938                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4939                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4940                                 if(clipxslope != 0)
4941                                 {
4942                                         clip0origin = -cliporigin/clipxslope;
4943                                         clip0slope = -clipyslope/clipxslope;
4944                                         clip0dir = clipxslope > 0 ? 1 : -1;
4945                                 }
4946                                 else if(clipyslope > 0)
4947                                 {
4948                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4949                                         clip0slope = dpsoftrast.fb_width;
4950                                         clip0dir = -1;
4951                                 }
4952                                 else if(clipyslope < 0)
4953                                 {
4954                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4955                                         clip0slope = -dpsoftrast.fb_width;
4956                                         clip0dir = -1;
4957                                 }
4958                                 else if(clip0origin < 0) continue;
4959                         }
4960
4961                         mipedgescale = _mm_setzero_ps();
4962                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4963                         {
4964                                 __m128 attrib0, attrib1, attrib2;
4965                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4966                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4967                                         break;
4968                                 arrays += numvertices*4;
4969                                 GENATTRIBS(attrib0, attrib1, attrib2);
4970                                 attriborigin = _mm_mul_ps(attrib1, w1);
4971                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4972                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4973                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4974                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4975                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4976                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4977                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4978                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4979                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4980                                 {
4981                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4982                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4983                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4984                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4985                                 }
4986                         }
4987
4988                         memset(triangle->mip, 0, sizeof(triangle->mip));
4989                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4990                         {
4991                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4992                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4993                                         break;
4994                                 texture = thread->texbound[texunit];
4995                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4996                                 {
4997                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4998                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4999                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5000                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5001                                         // this will be multiplied in the texturing routine by the texture resolution
5002                                         y = _mm_cvtss_si32(mipdensity);
5003                                         if (y > 0)
5004                                         {
5005                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5006                                                 if (y > texture->mipmaps - 1)
5007                                                         y = texture->mipmaps - 1;
5008                                                 triangle->mip[texunit] = y;
5009                                         }
5010                                 }
5011                         }
5012                 }
5013         
5014                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5015                 for (; y < bandy;)
5016                 {
5017                         __m128 xcoords, xslope;
5018                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5019                         int yccmask = _mm_movemask_epi8(ycc);
5020                         int edge0p, edge0n, edge1p, edge1n;
5021                         int nexty;
5022                         float w, wslope;
5023                         float clip0;
5024                         if (numpoints == 4)
5025                         {
5026                                 switch(yccmask)
5027                                 {
5028                                 default:
5029                                 case 0xFFFF: /*0000*/ y = endy; continue;
5030                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5031                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5032                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5033                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5034                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5035                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5036                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5037                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5038                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5039                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5040                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5041                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5042                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5043                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5044                                 case 0x0000: /*1111*/ y++; continue;
5045                                 }
5046                         }
5047                         else
5048                         {
5049                                 switch(yccmask)
5050                                 {
5051                                 default:
5052                                 case 0xFFFF: /*000*/ y = endy; continue;
5053                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5054                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5055                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5056                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5057                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5058                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5059                                 case 0x0000: /*111*/ y++; continue;
5060                                 }
5061                         }
5062                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5063                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5064                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5065                         nexty = _mm_extract_epi16(ycc, 0);
5066                         if (nexty >= bandy) nexty = bandy-1;
5067                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5068                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5069                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5070                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5071                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5072                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5073                         {
5074                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5075                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5076                         }
5077                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5078                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5079                         {
5080                                 int startx, endx, offset;
5081                                 startx = _mm_cvtss_si32(xcoords);
5082                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5083                                 if (startx < minx) startx = minx;
5084                                 if (endx > maxx) endx = maxx;
5085                                 if (startx >= endx) continue;
5086
5087                                 if (clip0dir)
5088                                 {
5089                                         if (clip0dir > 0)
5090                                         {
5091                                                 if (startx < clip0) 
5092                                                 {
5093                                                         if(endx <= clip0) continue;
5094                                                         startx = (int)clip0;
5095                                                 }
5096                                         }
5097                                         else if (endx > clip0) 
5098                                         {
5099                                                 if(startx >= clip0) continue;
5100                                                 endx = (int)clip0;
5101                                         }
5102                                 }
5103                                                 
5104                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5105                                 {
5106                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5107                                         span->triangle = thread->numtriangles;
5108                                         span->x = offset;
5109                                         span->y = y;
5110                                         span->startx = 0;
5111                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5112                                         if (span->startx >= span->endx)
5113                                                 continue;
5114                                         wslope = triangle->w[0];
5115                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5116                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5117                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5118                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5119                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5120                                 }
5121                         }
5122                 }
5123
5124                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5125                 {
5126                         DPSOFTRAST_Draw_ProcessSpans(thread);
5127                         thread->numtriangles = 0;
5128                 }
5129         }
5130
5131         if (!ATOMIC_DECREMENT(command->refcount))
5132         {
5133                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5134                         MM_FREE(command->arrays);
5135         }
5136
5137         if (thread->numspans > 0 || thread->numtriangles > 0)
5138         {
5139                 DPSOFTRAST_Draw_ProcessSpans(thread);
5140                 thread->numtriangles = 0;
5141         }
5142 #endif
5143 }
5144
5145 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5146 {
5147         int i;
5148         int j;
5149         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5150         int datasize = 2*numvertices*sizeof(float[4]);
5151         DPSOFTRAST_Command_Draw *command;
5152         unsigned char *data;
5153         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5154         {
5155                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5156                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5157                         break;
5158                 datasize += numvertices*sizeof(float[4]);
5159         }
5160         if (element3s)
5161                 datasize += numtriangles*sizeof(unsigned short[3]);
5162         else if (element3i)
5163                 datasize += numtriangles*sizeof(int[3]);
5164         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5165         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5166         {
5167                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5168                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5169         }
5170         else
5171         {
5172                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5173                 data = (unsigned char *)command + commandsize;
5174         }
5175         command->firstvertex = firstvertex;
5176         command->numvertices = numvertices;
5177         command->numtriangles = numtriangles;
5178         command->arrays = (float *)data;
5179         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5180         dpsoftrast.firstvertex = firstvertex;
5181         dpsoftrast.numvertices = numvertices;
5182         dpsoftrast.screencoord4f = (float *)data;
5183         data += numvertices*sizeof(float[4]);
5184         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5185         data += numvertices*sizeof(float[4]);
5186         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5187         {
5188                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5189                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5190                         break;
5191                 dpsoftrast.post_array4f[j] = (float *)data;
5192                 data += numvertices*sizeof(float[4]);
5193         }
5194         command->element3i = NULL;
5195         command->element3s = NULL;
5196         if (element3s)
5197         {
5198                 command->element3s = (unsigned short *)data;
5199                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5200         }
5201         else if (element3i)
5202         {
5203                 command->element3i = (int *)data;
5204                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5205         }
5206         return command;
5207 }
5208
5209 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5210 {
5211         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5212         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5213         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5214         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5215         if (command->starty >= command->endy)
5216         {
5217                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5218                         MM_FREE(command->arrays);
5219                 DPSOFTRAST_UndoCommand(command->commandsize);
5220                 return;
5221         }
5222         command->clipped = dpsoftrast.drawclipped;
5223         command->refcount = dpsoftrast.numthreads;
5224
5225         if (dpsoftrast.usethreads)
5226         {
5227                 int i;
5228                 DPSOFTRAST_Draw_SyncCommands();
5229                 for (i = 0; i < dpsoftrast.numthreads; i++)
5230                 {
5231                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5232                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5233                                 Thread_CondSignal(thread->drawcond);
5234                 }
5235         }
5236         else
5237         {
5238                 DPSOFTRAST_Draw_FlushThreads();
5239         }
5240 }
5241
5242 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5243 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5244 {
5245         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5246 }
5247 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5248 {
5249         DPSOFTRAST_Command_SetRenderTargets *command;
5250         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5251                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5252                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5253                 DPSOFTRAST_Flush();
5254         dpsoftrast.fb_width = width;
5255         dpsoftrast.fb_height = height;
5256         dpsoftrast.fb_depthpixels = depthpixels;
5257         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5258         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5259         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5260         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5261         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5262         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5263         command->width = width;
5264         command->height = height;
5265 }
5266  
5267 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5268 {
5269         int commandoffset = thread->commandoffset;
5270         while (commandoffset != endoffset)
5271         {
5272                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5273                 switch (command->opcode)
5274                 {
5275 #define INTERPCOMMAND(name) \
5276                 case DPSOFTRAST_OPCODE_##name : \
5277                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5278                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5279                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5280                                 commandoffset = 0; \
5281                         break;
5282                 INTERPCOMMAND(Viewport)
5283                 INTERPCOMMAND(ClearColor)
5284                 INTERPCOMMAND(ClearDepth)
5285                 INTERPCOMMAND(ColorMask)
5286                 INTERPCOMMAND(DepthTest)
5287                 INTERPCOMMAND(ScissorTest)
5288                 INTERPCOMMAND(Scissor)
5289                 INTERPCOMMAND(BlendFunc)
5290                 INTERPCOMMAND(BlendSubtract)
5291                 INTERPCOMMAND(DepthMask)
5292                 INTERPCOMMAND(DepthFunc)
5293                 INTERPCOMMAND(DepthRange)
5294                 INTERPCOMMAND(PolygonOffset)
5295                 INTERPCOMMAND(CullFace)
5296                 INTERPCOMMAND(AlphaTest)
5297                 INTERPCOMMAND(AlphaFunc)
5298                 INTERPCOMMAND(SetTexture)
5299                 INTERPCOMMAND(SetShader)
5300                 INTERPCOMMAND(Uniform4f)
5301                 INTERPCOMMAND(UniformMatrix4f)
5302                 INTERPCOMMAND(Uniform1i)
5303                 INTERPCOMMAND(SetRenderTargets)
5304                 INTERPCOMMAND(ClipPlane)
5305
5306                 case DPSOFTRAST_OPCODE_Draw:
5307                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5308                         commandoffset += command->commandsize;
5309                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5310                                 commandoffset = 0;
5311                         thread->commandoffset = commandoffset;
5312                         break;
5313
5314                 case DPSOFTRAST_OPCODE_Reset:
5315                         commandoffset = 0;
5316                         break;
5317                 }
5318         }
5319         thread->commandoffset = commandoffset;
5320 }
5321
5322 static int DPSOFTRAST_Draw_Thread(void *data)
5323 {
5324         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5325         while(thread->index >= 0)
5326         {
5327                 if (thread->commandoffset != dpsoftrast.drawcommand)
5328                 {
5329                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5330                 }
5331                 else 
5332                 {
5333                         Thread_LockMutex(thread->drawmutex);
5334                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5335                         {
5336                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5337                                 thread->starving = true;
5338                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5339                                 thread->starving = false;
5340                         }
5341                         Thread_UnlockMutex(thread->drawmutex);
5342                 }
5343         }   
5344         return 0;
5345 }
5346
5347 static void DPSOFTRAST_Draw_FlushThreads(void)
5348 {
5349         DPSOFTRAST_State_Thread *thread;
5350         int i;
5351         DPSOFTRAST_Draw_SyncCommands();
5352         if (dpsoftrast.usethreads) 
5353         {
5354                 for (i = 0; i < dpsoftrast.numthreads; i++)
5355                 {
5356                         thread = &dpsoftrast.threads[i];
5357                         if (thread->commandoffset != dpsoftrast.drawcommand)
5358                         {
5359                                 Thread_LockMutex(thread->drawmutex);
5360                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5361                                         Thread_CondSignal(thread->drawcond);
5362                                 Thread_UnlockMutex(thread->drawmutex);
5363                         }
5364                 }
5365                 for (i = 0; i < dpsoftrast.numthreads; i++)
5366                 {
5367                         thread = &dpsoftrast.threads[i];
5368                         if (thread->commandoffset != dpsoftrast.drawcommand)
5369                         {
5370                                 Thread_LockMutex(thread->drawmutex);
5371                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5372                                 {
5373                                         thread->waiting = true;
5374                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5375                                         thread->waiting = false;
5376                                 }
5377                                 Thread_UnlockMutex(thread->drawmutex);
5378                         }
5379                 }
5380         }
5381         else
5382         {
5383                 for (i = 0; i < dpsoftrast.numthreads; i++)
5384                 {
5385                         thread = &dpsoftrast.threads[i];
5386                         if (thread->commandoffset != dpsoftrast.drawcommand)
5387                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5388                 }
5389         }
5390         dpsoftrast.commandpool.usedcommands = 0;
5391 }
5392
// Public flush: forwards to DPSOFTRAST_Draw_FlushThreads(), which processes
// (or waits for) all queued commands and resets the command pool usage.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5397
// Public finish: currently does exactly what DPSOFTRAST_Flush() does, i.e.
// runs DPSOFTRAST_Draw_FlushThreads() so that all queued commands complete.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5402
5403 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5404 {
5405         int i;
5406         union
5407         {
5408                 int i;
5409                 unsigned char b[4];
5410         }
5411         u;
5412         u.i = 1;
5413         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5414         dpsoftrast.bigendian = u.b[3];
5415         dpsoftrast.fb_width = width;
5416         dpsoftrast.fb_height = height;
5417         dpsoftrast.fb_depthpixels = depthpixels;
5418         dpsoftrast.fb_colorpixels[0] = colorpixels;
5419         dpsoftrast.fb_colorpixels[1] = NULL;
5420         dpsoftrast.fb_colorpixels[1] = NULL;
5421         dpsoftrast.fb_colorpixels[1] = NULL;
5422         dpsoftrast.viewport[0] = 0;
5423         dpsoftrast.viewport[1] = 0;
5424         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5425         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5426         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5427         dpsoftrast.texture_firstfree = 1;
5428         dpsoftrast.texture_end = 1;
5429         dpsoftrast.texture_max = 0;
5430         dpsoftrast.color[0] = 1;
5431         dpsoftrast.color[1] = 1;
5432         dpsoftrast.color[2] = 1;
5433         dpsoftrast.color[3] = 1;
5434         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5435         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5436         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5437         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5438         for (i = 0; i < dpsoftrast.numthreads; i++)
5439         {
5440                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5441                 thread->index = i;
5442                 thread->cullface = GL_BACK;
5443         thread->colormask[0] = 1; 
5444                 thread->colormask[1] = 1;
5445                 thread->colormask[2] = 1;
5446                 thread->colormask[3] = 1;
5447                 thread->blendfunc[0] = GL_ONE;
5448                 thread->blendfunc[1] = GL_ZERO;
5449                 thread->depthmask = true;
5450                 thread->depthtest = true;
5451                 thread->depthfunc = GL_LEQUAL;
5452                 thread->scissortest = false;
5453                 thread->alphatest = false;
5454                 thread->alphafunc = GL_GREATER;
5455                 thread->alphavalue = 0.5f;
5456                 thread->viewport[0] = 0;
5457                 thread->viewport[1] = 0;
5458                 thread->viewport[2] = dpsoftrast.fb_width;
5459                 thread->viewport[3] = dpsoftrast.fb_height;
5460                 thread->scissor[0] = 0;
5461                 thread->scissor[1] = 0;
5462                 thread->scissor[2] = dpsoftrast.fb_width;
5463                 thread->scissor[3] = dpsoftrast.fb_height;
5464                 thread->depthrange[0] = 0;
5465                 thread->depthrange[1] = 1;
5466                 thread->polygonoffset[0] = 0;
5467                 thread->polygonoffset[1] = 0;
5468                 thread->clipplane[0] = 0;
5469                 thread->clipplane[1] = 0;
5470                 thread->clipplane[2] = 0;
5471                 thread->clipplane[3] = 1;
5472         
5473                 thread->numspans = 0;
5474                 thread->numtriangles = 0;
5475                 thread->commandoffset = 0;
5476                 thread->waiting = false;
5477                 thread->starving = false;
5478            
5479                 thread->validate = -1;
5480                 DPSOFTRAST_Validate(thread, -1);
5481  
5482                 if (dpsoftrast.usethreads)
5483                 {
5484                         thread->waitcond = Thread_CreateCond();
5485                         thread->drawcond = Thread_CreateCond();
5486                         thread->drawmutex = Thread_CreateMutex();
5487                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5488                 }
5489         }
5490         return 0;
5491 }
5492
5493 void DPSOFTRAST_Shutdown(void)
5494 {
5495         int i;
5496         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5497         {
5498                 DPSOFTRAST_State_Thread *thread;
5499                 for (i = 0; i < dpsoftrast.numthreads; i++)
5500                 {
5501                         thread = &dpsoftrast.threads[i];
5502                         Thread_LockMutex(thread->drawmutex);
5503                         thread->index = -1;
5504                         Thread_CondSignal(thread->drawcond);
5505                         Thread_UnlockMutex(thread->drawmutex);
5506                         Thread_WaitThread(thread->thread, 0);
5507                         Thread_DestroyCond(thread->waitcond);
5508                         Thread_DestroyCond(thread->drawcond);
5509                         Thread_DestroyMutex(thread->drawmutex);
5510                 }
5511         }
5512         for (i = 0;i < dpsoftrast.texture_end;i++)
5513                 if (dpsoftrast.texture[i].bytes)
5514                         MM_FREE(dpsoftrast.texture[i].bytes);
5515         if (dpsoftrast.texture)
5516                 free(dpsoftrast.texture);
5517         if (dpsoftrast.threads)
5518                 MM_FREE(dpsoftrast.threads);
5519         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5520 }
5521