/* Source: git.xonotic.org Git - xonotic/darkplaces.git / blob - dpsoftrast.c
 * Commit subject: use both .items2 and serverflags in items stat, to finally fix the runes
 * Path: [xonotic/darkplaces.git] / dpsoftrast.c */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// MSVC warning C4324 reports structures that were padded because of an
// alignment specifier; the ALIGN()ed structures in this file trigger it
#ifdef _MSC_VER
#pragma warning(disable : 4324)
#endif

// C builds get bool as an alias of qboolean; C++ builds keep the built-in bool
#ifndef __cplusplus
typedef qboolean bool;
#endif

// ALIGN_SIZE is the alignment passed to _mm_malloc by MM_MALLOC/MM_CALLOC;
// the ALIGN()/ATOMIC() macros below hard-code the same 16 and 4 byte values
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// Per-compiler definitions, only selected when SSE_POSSIBLE is defined:
//   ALIGN(var)              declare var with 16 byte alignment
//   ATOMIC(var)             declare var with 4 byte alignment
//   MEMORY_BARRIER          store fence (_mm_sfence)
//   ATOMIC_COUNTER          integer type used for atomic counters
//   ATOMIC_INCREMENT/ATOMIC_DECREMENT/ATOMIC_ADD   atomic counter operations
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__) && defined(WIN32)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile LONG
                // this LONG * cast serves to fix an issue with broken mingw
                // packages on Ubuntu; these only declare the function to take
                // a LONG *, causing a compile error here. This seems to be
                // error- and warn-free on platforms that DO declare
                // InterlockedIncrement correctly, like mingw on Windows.
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(4)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// Fallbacks for anything the branches above did not define: no alignment
// attribute, no barrier, and plain (non-atomic) integer operations
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
// Allocation helpers: MM_MALLOC / MM_CALLOC / MM_FREE.
// With SSE_POSSIBLE they return ALIGN_SIZE-aligned memory from _mm_malloc,
// otherwise they map straight onto malloc / calloc / free.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// substitute _mm_cvtss_f32 on GCC releases older than 4.6 (not on clang);
// NOTE: the version test previously read __GNUC (undefined, evaluates to 0)
// and compared the minor version without looking at the major version
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// aligned equivalent of calloc: returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or nmemb*size does not fit in size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ALIGN_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Slots of the per-vertex attribute arrays: position, color and eight
// texcoord sets. DPSOFTRAST_ARRAY_TOTAL is the slot count and is used to
// size arrays indexed by this enum.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION  = 0,
        DPSOFTRAST_ARRAY_COLOR     = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
125
// One slot of the dpsoftrast.texture array.
// A slot whose bytes pointer is NULL is treated as unused by
// DPSOFTRAST_Texture_GetByIndex and DPSOFTRAST_Texture_New.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // DPSOFTRAST_TEXTURE_FLAG_* / format bits as passed to DPSOFTRAST_Texture_New
        int width;
        int height;
        int depth;
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of entries used in mipmap[]
        int size; // total size in bytes of all mip levels
        ATOMIC_COUNTER binds; // when non-zero, DPSOFTRAST_Texture_Free flushes before freeing
        unsigned char *bytes; // pixel storage for all mip levels (allocated with MM_CALLOC)
        // per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
141
// command structures use the same alignment as ALIGN()ed data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header shared by every command structure: an opcode identifying
// the command type and the size of the command
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and an aligned structure
// DPSOFTRAST_Command_<name> that starts with the common header
// (opcode, commandsize) followed by the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
162
// size in bytes of the command storage in DPSOFTRAST_State_Command_Pool
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - confirm against the allocator
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// fixed-size byte buffer that holds queued commands
// NOTE(review): freecommand/usedcommands look like the write position and
// the amount of space in use - confirm against the code that fills the pool
typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
173
// per-triangle interpolation setup used when shading spans
typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // per attribute array, three 4-component vectors as consumed by
        // DPSOFTRAST_CALCATTRIB: [0] is multiplied by span x (the slope),
        // [1] is multiplied by span y, [2] is the constant term
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
181
// Evaluate one interpolated attribute at the start of a span (SSE version).
//   slope = attribs[arrayindex][0]  (per-pixel x step)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues; attribs must be 16 byte aligned
// because _mm_load_ps is used.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: slope and data are float[4]
// (or float pointers). All pointer arguments are parenthesized so that
// expressions such as "spans + i" can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
// NOTE(review): not referenced in the visible part of the file; presumably
// the pixel count of one sub-span during shading - confirm at the use sites
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels generated from a triangle; stored in
// DPSOFTRAST_State_Thread::spans
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
213
// capacities of the per-thread buffers in DPSOFTRAST_State_Thread:
// spans[], triangles[] and pixelmaskarray[] (the latter plus 4 padding bytes)
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bit flags stored in DPSOFTRAST_State_Thread::validate; a set bit means the
// matching group of derived values must be recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all derived state
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps
// the GL blend factor pair (and the subtract flag) onto one of these.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
        DPSOFTRAST_BLENDMODE_ALPHA       = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
        DPSOFTRAST_BLENDMODE_ADD         = 3,
        DPSOFTRAST_BLENDMODE_INVMOD      = 4,
        DPSOFTRAST_BLENDMODE_MUL         = 5,
        DPSOFTRAST_BLENDMODE_MUL2        = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_INVADD      = 9,
        DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
238
// Per-thread rasterizer state: a private copy of the render state, the
// values derived from it (recomputed on demand via the validate flags), the
// rows of the framebuffer this thread covers, and its span/triangle buffers.
typedef ALIGN(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread among dpsoftrast.numthreads (used to compute the bands below)
        
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor (see DPSOFTRAST_RecalcBlendFunc)
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int viewport[4];
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane rescaled by the viewport transform (DPSOFTRAST_RecalcClipPlane)

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // rebased by DPSOFTRAST_Texture_Grow when the texture array moves
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height clamped to the framebuffer (DPSOFTRAST_RecalcFB)
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        // (set by DPSOFTRAST_RecalcThread; with interlacing there are two
        // separate row ranges, otherwise range 2 equals range 1)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
304
// Global rasterizer state: framebuffer, the render state being set up,
// vertex array pointers, the texture table, worker threads and the command
// pool.
typedef ALIGN(struct DPSOFTRAST_State_s
{
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // rebased by DPSOFTRAST_Texture_Grow when the texture array moves

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table (see DPSOFTRAST_Texture_New / DPSOFTRAST_Texture_Free)
        int texture_max; // allocated number of slots in texture[]
        int texture_end; // one past the highest slot in use
        int texture_firstfree; // index where the search for a free slot starts
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // non-zero: each thread gets two row bands (DPSOFTRAST_RecalcThread)
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer instance
DPSOFTRAST_State dpsoftrast;
364
// depth buffer values are (1 - depth) scaled by 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (expected range 0..1) into a 32bit value with
// blue in bits 0-7, green in 8-15, red in 16-23 and alpha in 24-31, rounding
// each channel to the nearest of 256 levels. Arguments are parenthesized so
// expressions can be passed; the alpha shift is done on an unsigned value
// because shifting 128..255 into bit 31 of a signed int is undefined.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer encoding.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         memcpy(dst, pixels, blockwidth * 4);
755                         pixels += blockwidth * 4;
756                         dst += texture->mipmap[0][2] * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770         DPSOFTRAST_Texture_CalculateMipmaps(index);
771 }
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
773 {
774         DPSOFTRAST_Texture *texture;
775         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776         return texture->mipmap[mip][2];
777 }
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][3];
783 }
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][4];
789 }
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         if (texture->binds)
795                 DPSOFTRAST_Flush();
796         return texture->bytes + texture->mipmap[mip][0];
797 }
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
799 {
800         DPSOFTRAST_Texture *texture;
801         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
802         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
803         {
804                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
805                 return;
806         }
807         if (texture->binds)
808                 DPSOFTRAST_Flush();
809         texture->filter = filter;
810 }
811
812 static void DPSOFTRAST_Draw_FlushThreads(void);
813
814 static void DPSOFTRAST_Draw_SyncCommands(void)
815 {
816         if(dpsoftrast.usethreads) MEMORY_BARRIER;
817         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
818 }
819
820 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
821 {
822         DPSOFTRAST_State_Thread *thread;
823         int i;
824         int freecommand = dpsoftrast.commandpool.freecommand;
825         int usedcommands = dpsoftrast.commandpool.usedcommands;
826         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
827                 return;
828         DPSOFTRAST_Draw_SyncCommands();
829         for(;;)
830         {
831                 int waitindex = -1;
832                 int commandoffset;
833                 usedcommands = 0;
834                 for (i = 0; i < dpsoftrast.numthreads; i++)
835                 {
836                         thread = &dpsoftrast.threads[i]; 
837                         commandoffset = freecommand - thread->commandoffset;
838                         if (commandoffset < 0)
839                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
840                         if (commandoffset > usedcommands)
841                         {
842                                 waitindex = i;
843                                 usedcommands = commandoffset;
844                         }
845                 }
846                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
847                         break;
848                 thread = &dpsoftrast.threads[waitindex];
849                 Thread_LockMutex(thread->drawmutex);
850                 if (thread->commandoffset != dpsoftrast.drawcommand)
851                 {
852                         thread->waiting = true;
853                         if (thread->starving) Thread_CondSignal(thread->drawcond);
854                         Thread_CondWait(thread->waitcond, thread->drawmutex);
855                         thread->waiting = false;
856                 }
857                 Thread_UnlockMutex(thread->drawmutex);
858         }
859         dpsoftrast.commandpool.usedcommands = usedcommands;
860 }
861
// Adds padding to `size` computed by masking with (COMMAND_SIZE-1); for a
// power-of-two COMMAND_SIZE this rounds `size` up to a multiple of
// COMMAND_SIZE (NOTE(review): COMMAND_SIZE is defined outside this section -
// confirm it is a power of two).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE-1) & (COMMAND_SIZE - ((COMMAND_SIZE-1) & (size)))))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> from the command pool; the requested size is the
// struct size padded by DPSOFTRAST_ALIGNCOMMAND.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
866
867 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
868 {
869         DPSOFTRAST_Command *command;
870         int freecommand = dpsoftrast.commandpool.freecommand;
871         int usedcommands = dpsoftrast.commandpool.usedcommands;
872         int extra = sizeof(DPSOFTRAST_Command);
873         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
875         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
876         {
877                 if (dpsoftrast.usethreads)
878                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
879                 else
880                         DPSOFTRAST_Draw_FlushThreads();
881                 freecommand = dpsoftrast.commandpool.freecommand;
882                 usedcommands = dpsoftrast.commandpool.usedcommands;
883         }
884         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
885         {
886                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887                 command->opcode = DPSOFTRAST_OPCODE_Reset;
888                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
889                 freecommand = 0;
890         }
891         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892         command->opcode = opcode;
893         command->commandsize = size;
894         freecommand += size;
895         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
896                 freecommand = 0;
897         dpsoftrast.commandpool.freecommand = freecommand;
898         dpsoftrast.commandpool.usedcommands = usedcommands + size;
899         return command;
900 }
901
902 static void DPSOFTRAST_UndoCommand(int size)
903 {
904         int freecommand = dpsoftrast.commandpool.freecommand;
905         int usedcommands = dpsoftrast.commandpool.usedcommands;
906         freecommand -= size;
907         if (freecommand < 0)
908                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909         usedcommands -= size;
910         dpsoftrast.commandpool.freecommand = freecommand;
911         dpsoftrast.commandpool.usedcommands = usedcommands;
912 }
913                 
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
916 {
917         thread->viewport[0] = command->x;
918         thread->viewport[1] = command->y;
919         thread->viewport[2] = command->width;
920         thread->viewport[3] = command->height;
921         thread->validate |= DPSOFTRAST_VALIDATE_FB;
922 }
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
924 {
925         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
926         command->x = x;
927         command->y = y;
928         command->width = width;
929         command->height = height;
930
931         dpsoftrast.viewport[0] = x;
932         dpsoftrast.viewport[1] = y;
933         dpsoftrast.viewport[2] = width;
934         dpsoftrast.viewport[3] = height;
935         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
936 }
937
938 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
939 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
940 {
941         int i, x1, y1, x2, y2, w, h, x, y;
942         int miny1, maxy1, miny2, maxy2;
943         int bandy;
944         unsigned int *p;
945         unsigned int c;
946         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
947         miny1 = thread->miny1;
948         maxy1 = thread->maxy1;
949         miny2 = thread->miny2;
950         maxy2 = thread->maxy2;
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         // FIXME: honor fb_colormask?
962         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
963         for (i = 0;i < 4;i++)
964         {
965                 if (!dpsoftrast.fb_colorpixels[i])
966                         continue;
967                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
968                 for (;y < bandy;y++)
969                 {
970                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
971                         for (x = x1;x < x2;x++)
972                                 p[x] = c;
973                 }
974         }
975 }
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
977 {
978         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(3, ClearDepth, float depth;)
986 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
987 {
988         int x1, y1, x2, y2, w, h, x, y;
989         int miny1, maxy1, miny2, maxy2;
990         int bandy;
991         unsigned int *p;
992         unsigned int c;
993         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
994         miny1 = thread->miny1;
995         maxy1 = thread->maxy1;
996         miny2 = thread->miny2;
997         maxy2 = thread->maxy2;
998         x1 = thread->fb_scissor[0];
999         y1 = thread->fb_scissor[1];
1000         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1001         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1002         if (y1 < miny1) y1 = miny1;
1003         if (y2 > maxy2) y2 = maxy2;
1004         w = x2 - x1;
1005         h = y2 - y1;
1006         if (w < 1 || h < 1)
1007                 return;
1008         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1009         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1010         for (;y < bandy;y++)
1011         {
1012                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1013                 for (x = x1;x < x2;x++)
1014                         p[x] = c;
1015         }
1016 }
1017 void DPSOFTRAST_ClearDepth(float d)
1018 {
1019         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1020         command->depth = d;
1021 }
1022
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1025 {
1026         thread->colormask[0] = command->r != 0;
1027         thread->colormask[1] = command->g != 0;
1028         thread->colormask[2] = command->b != 0;
1029         thread->colormask[3] = command->a != 0;
1030         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1031 }
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1033 {
1034         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1035         command->r = r;
1036         command->g = g;
1037         command->b = b;
1038         command->a = a;
1039 }
1040
1041 DEFCOMMAND(5, DepthTest, int enable;)
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1043 {
1044         thread->depthtest = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1046 }
1047 void DPSOFTRAST_DepthTest(int enable)
1048 {
1049         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(6, ScissorTest, int enable;)
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1055 {
1056         thread->scissortest = command->enable;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_ScissorTest(int enable)
1060 {
1061         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062         command->enable = enable;
1063 }
1064
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1067 {
1068         thread->scissor[0] = command->x;
1069         thread->scissor[1] = command->y;
1070         thread->scissor[2] = command->width;
1071         thread->scissor[3] = command->height;
1072         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1073 }
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1075 {
1076         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1077         command->x = x;
1078         command->y = y;
1079         command->width = width;
1080         command->height = height;
1081 }
1082
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1085 {
1086         thread->blendfunc[0] = command->sfactor;
1087         thread->blendfunc[1] = command->dfactor;
1088         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1089 }
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1091 {
1092         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093         command->sfactor = sfactor;
1094         command->dfactor = dfactor;
1095 }
1096
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1099 {
1100         thread->blendsubtract = command->enable;
1101         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1102 }
1103 void DPSOFTRAST_BlendSubtract(int enable)
1104 {
1105         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(10, DepthMask, int enable;)
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1111 {
1112         thread->depthmask = command->enable;
1113 }
1114 void DPSOFTRAST_DepthMask(int enable)
1115 {
1116         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117         command->enable = enable;
1118 }
1119
1120 DEFCOMMAND(11, DepthFunc, int func;)
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1122 {
1123         thread->depthfunc = command->func;
1124 }
1125 void DPSOFTRAST_DepthFunc(int func)
1126 {
1127         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128         command->func = func;
1129 }
1130
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1133 {
1134         thread->depthrange[0] = command->nearval;
1135         thread->depthrange[1] = command->farval;
1136 }
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1138 {
1139         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140         command->nearval = nearval;
1141         command->farval = farval;
1142 }
1143
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1146 {
1147         thread->polygonoffset[0] = command->alongnormal;
1148         thread->polygonoffset[1] = command->intoview;
1149 }
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1151 {
1152         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153         command->alongnormal = alongnormal;
1154         command->intoview = intoview;
1155 }
1156
1157 DEFCOMMAND(14, CullFace, int mode;)
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1159 {
1160         thread->cullface = command->mode;
1161 }
1162 void DPSOFTRAST_CullFace(int mode)
1163 {
1164         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165         command->mode = mode;
1166 }
1167
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1169 {
1170         dpsoftrast.color[0] = r;
1171         dpsoftrast.color[1] = g;
1172         dpsoftrast.color[2] = b;
1173         dpsoftrast.color[3] = a;
1174 }
1175
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1177 {
1178         int outstride = blockwidth * 4;
1179         int instride = dpsoftrast.fb_width * 4;
1180         int bx1 = blockx;
1181         int by1 = blocky;
1182         int bx2 = blockx + blockwidth;
1183         int by2 = blocky + blockheight;
1184         int bw;
1185         int x;
1186         int y;
1187         unsigned char *inpixels;
1188         unsigned char *b;
1189         unsigned char *o;
1190         DPSOFTRAST_Flush();
1191         if (bx1 < 0) bx1 = 0;
1192         if (by1 < 0) by1 = 0;
1193         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1195         bw = bx2 - bx1;
1196         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1197         if (dpsoftrast.bigendian)
1198         {
1199                 for (y = by1;y < by2;y++)
1200                 {
1201                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1203                         for (x = bx1;x < bx2;x++)
1204                         {
1205                                 o[0] = b[3];
1206                                 o[1] = b[2];
1207                                 o[2] = b[1];
1208                                 o[3] = b[0];
1209                                 o += 4;
1210                                 b += 4;
1211                         }
1212                 }
1213         }
1214         else
1215         {
1216                 for (y = by1;y < by2;y++)
1217                 {
1218                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1220                         memcpy(o, b, bw*4);
1221                 }
1222         }
1223
1224 }
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1226 {
1227         int tx1 = tx;
1228         int ty1 = ty;
1229         int tx2 = tx + width;
1230         int ty2 = ty + height;
1231         int sx1 = sx;
1232         int sy1 = sy;
1233         int sx2 = sx + width;
1234         int sy2 = sy + height;
1235         int swidth;
1236         int sheight;
1237         int twidth;
1238         int theight;
1239         int sw;
1240         int sh;
1241         int tw;
1242         int th;
1243         int y;
1244         unsigned int *spixels;
1245         unsigned int *tpixels;
1246         DPSOFTRAST_Texture *texture;
1247         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248         if (mip < 0 || mip >= texture->mipmaps) return;
1249         DPSOFTRAST_Flush();
1250         spixels = dpsoftrast.fb_colorpixels[0];
1251         swidth = dpsoftrast.fb_width;
1252         sheight = dpsoftrast.fb_height;
1253         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254         twidth = texture->mipmap[mip][2];
1255         theight = texture->mipmap[mip][3];
1256         if (tx1 < 0) tx1 = 0;
1257         if (ty1 < 0) ty1 = 0;
1258         if (tx2 > twidth) tx2 = twidth;
1259         if (ty2 > theight) ty2 = theight;
1260         if (sx1 < 0) sx1 = 0;
1261         if (sy1 < 0) sy1 = 0;
1262         if (sx2 > swidth) sx2 = swidth;
1263         if (sy2 > sheight) sy2 = sheight;
1264         tw = tx2 - tx1;
1265         th = ty2 - ty1;
1266         sw = sx2 - sx1;
1267         sh = sy2 - sy1;
1268         if (tw > sw) tw = sw;
1269         if (th > sh) th = sh;
1270         if (tw < 1 || th < 1)
1271                 return;
1272         sy1 = sheight - 1 - sy1;
1273         for (y = 0;y < th;y++)
1274                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1275         if (texture->mipmaps > 1)
1276                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1277 }
1278
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1281 {
1282         if (thread->texbound[command->unitnum])
1283                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284         thread->texbound[command->unitnum] = command->texture;
1285 }
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1287 {
1288         DPSOFTRAST_Command_SetTexture *command;
1289         DPSOFTRAST_Texture *texture;
1290         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1291         {
1292                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1293                 return;
1294         }
1295         texture = DPSOFTRAST_Texture_GetByIndex(index);
1296         if (index && !texture)
1297         {
1298                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1299                 return;
1300         }
1301
1302         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303         command->unitnum = unitnum;
1304         command->texture = texture;
1305
1306         dpsoftrast.texbound[unitnum] = texture;
1307         if (texture)
1308                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1309 }
1310
1311 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1312 {
1313         dpsoftrast.pointer_vertex3f = vertex3f;
1314         dpsoftrast.stride_vertex = stride;
1315 }
1316 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1317 {
1318         dpsoftrast.pointer_color4f = color4f;
1319         dpsoftrast.pointer_color4ub = NULL;
1320         dpsoftrast.stride_color = stride;
1321 }
1322 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1323 {
1324         dpsoftrast.pointer_color4f = NULL;
1325         dpsoftrast.pointer_color4ub = color4ub;
1326         dpsoftrast.stride_color = stride;
1327 }
1328 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1329 {
1330         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1331         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1332         dpsoftrast.stride_texcoord[unitnum] = stride;
1333 }
1334
1335 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1336 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1337 {
1338         thread->shader_mode = command->mode;
1339         thread->shader_permutation = command->permutation;
1340         thread->shader_exactspecularmath = command->exactspecularmath;
1341 }
1342 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1343 {
1344         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1345         command->mode = mode;
1346         command->permutation = permutation;
1347         command->exactspecularmath = exactspecularmath;
1348
1349         dpsoftrast.shader_mode = mode;
1350         dpsoftrast.shader_permutation = permutation;
1351         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1352 }
1353
1354 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1355 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1356 {
1357         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1358 }
1359 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1360 {
1361         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362         command->index = index;
1363         command->val[0] = v0;
1364         command->val[1] = v1;
1365         command->val[2] = v2;
1366         command->val[3] = v3;
1367
1368         dpsoftrast.uniform4f[index*4+0] = v0;
1369         dpsoftrast.uniform4f[index*4+1] = v1;
1370         dpsoftrast.uniform4f[index*4+2] = v2;
1371         dpsoftrast.uniform4f[index*4+3] = v3;
1372 }
1373 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1374 {
1375         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1376         command->index = index;
1377         memcpy(command->val, v, sizeof(command->val));
1378
1379         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1380 }
1381
1382 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1383 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1384 {
1385         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1386 }
1387 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1388 {
1389 #ifdef SSE_POSSIBLE
1390         int i, index;
1391         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1392         {
1393                 __m128 m0, m1, m2, m3;
1394                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1395                 command->index = (DPSOFTRAST_UNIFORM)index;
1396                 if (((size_t)v)&(ALIGN_SIZE-1))
1397                 {
1398                         m0 = _mm_loadu_ps(v);
1399                         m1 = _mm_loadu_ps(v+4);
1400                         m2 = _mm_loadu_ps(v+8);
1401                         m3 = _mm_loadu_ps(v+12);
1402                 }
1403                 else
1404                 {
1405                         m0 = _mm_load_ps(v);
1406                         m1 = _mm_load_ps(v+4);
1407                         m2 = _mm_load_ps(v+8);
1408                         m3 = _mm_load_ps(v+12);
1409                 }
1410                 if (transpose)
1411                 {
1412                         __m128 t0, t1, t2, t3;
1413                         t0 = _mm_unpacklo_ps(m0, m1);
1414                         t1 = _mm_unpacklo_ps(m2, m3);
1415                         t2 = _mm_unpackhi_ps(m0, m1);
1416                         t3 = _mm_unpackhi_ps(m2, m3);
1417                         m0 = _mm_movelh_ps(t0, t1);
1418                         m1 = _mm_movehl_ps(t1, t0);
1419                         m2 = _mm_movelh_ps(t2, t3);
1420                         m3 = _mm_movehl_ps(t3, t2);                     
1421                 }
1422                 _mm_store_ps(command->val, m0);
1423                 _mm_store_ps(command->val+4, m1);
1424                 _mm_store_ps(command->val+8, m2);
1425                 _mm_store_ps(command->val+12, m3);
1426                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1427                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1428                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1429                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1430         }
1431 #endif
1432 }
1433
1434 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1435 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1436 {
1437         thread->uniform1i[command->index] = command->val;
1438 }
1439 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1440 {
1441         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1442         command->index = index;
1443         command->val = i0;
1444
1445         dpsoftrast.uniform1i[command->index] = i0;
1446 }
1447
1448 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1449 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1450 {
1451         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1452         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1453 }
1454 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1455 {
1456         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1457         command->clipplane[0] = x;
1458         command->clipplane[1] = y;
1459         command->clipplane[2] = z;
1460         command->clipplane[3] = w;
1461 }
1462
1463 #ifdef SSE_POSSIBLE
1464 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1465 {
1466         float *end = dst + size*4;
1467         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1468         {
1469                 while (dst < end)
1470                 {
1471                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1472                         dst += 4;
1473                         src += stride;
1474                 }
1475         }
1476         else
1477         {
1478                 while (dst < end)
1479                 {
1480                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1481                         dst += 4;
1482                         src += stride;
1483                 }
1484         }
1485 }
1486
1487 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1488 {
1489         float *end = dst + size*4;
1490         if (stride == sizeof(float[3]))
1491         {
1492                 float *end4 = dst + (size&~3)*4;        
1493                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1494                 {
1495                         while (dst < end4)
1496                         {
1497                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1498                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1499                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1500                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1501                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1502                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1505                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1506                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1508                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1509                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510                                 dst += 16;
1511                                 src += 4*sizeof(float[3]);
1512                         }
1513                 }
1514                 else
1515                 {
1516                         while (dst < end4)
1517                         {
1518                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1519                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1520                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1521                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1522                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1523                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1524                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1525                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1526                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1527                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1530                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531                                 dst += 16;
1532                                 src += 4*sizeof(float[3]);
1533                         }
1534                 }
1535         }
1536         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1537         {
1538                 while (dst < end)
1539                 {
1540                         __m128 v = _mm_loadu_ps((const float *)src);
1541                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1542                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1543                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1544                         _mm_store_ps(dst, v);
1545                         dst += 4;
1546                         src += stride;
1547                 }
1548         }
1549         else
1550         {
1551                 while (dst < end)
1552                 {
1553                         __m128 v = _mm_load_ps((const float *)src);
1554                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1555                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1556                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1557                         _mm_store_ps(dst, v);
1558                         dst += 4;
1559                         src += stride;
1560                 }
1561         }
1562 }
1563
1564 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1565 {
1566         float *end = dst + size*4;
1567         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1568         if (stride == sizeof(float[2]))
1569         {
1570                 float *end2 = dst + (size&~1)*4;
1571                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1572                 {
1573                         while (dst < end2)
1574                         {
1575                                 __m128 v = _mm_loadu_ps((const float *)src);
1576                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1577                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1578                                 dst += 8;
1579                                 src += 2*sizeof(float[2]);
1580                         }
1581                 }
1582                 else
1583                 {
1584                         while (dst < end2)
1585                         {
1586                                 __m128 v = _mm_load_ps((const float *)src);
1587                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1588                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1589                                 dst += 8;
1590                                 src += 2*sizeof(float[2]);
1591                         }
1592                 }
1593         }
1594         while (dst < end)
1595         {
1596                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1597                 dst += 4;
1598                 src += stride;
1599         }
1600 }
1601
1602 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1603 {
1604         float *end = dst + size*4;
1605         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1606         if (stride == sizeof(unsigned char[4]))
1607         {
1608                 float *end4 = dst + (size&~3)*4;
1609                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1610                 {
1611                         while (dst < end4)
1612                         {
1613                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1614                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1615                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1616                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1617                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1618                                 dst += 16;
1619                                 src += 4*sizeof(unsigned char[4]);
1620                         }
1621                 }
1622                 else
1623                 {
1624                         while (dst < end4)
1625                         {
1626                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1627                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1628                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1630                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1631                                 dst += 16;
1632                                 src += 4*sizeof(unsigned char[4]);
1633                         }
1634                 }
1635         }
1636         while (dst < end)
1637         {
1638                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1639                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1640                 dst += 4;
1641                 src += stride;
1642         }
1643 }
1644
// replicates one 4-float vector 'size' times
//   dst - destination, size*4 floats, written with aligned 16-byte stores
//   src - the 4 floats to replicate (read once, with an unaligned load)
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + 4*i, value);
}
1655 #endif
1656
// multiplies 'numitems' 4-float vectors by a 16-float matrix
//   out4f       - destination (may be the same pointer as in4f), written with aligned stores
//   in4f        - source vectors
//   inmatrix16f - four consecutive 4-float vectors; the result for an input
//                 (x, y, z, w) is x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// a matrix that is bytewise equal to identity degenerates to a plain copy.
// without SSE_POSSIBLE this function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			// accumulate from the w term outwards: x*col0 + (y*col1 + (z*col2 + w*col3))
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			// accumulate from the w term outwards: x*col0 + (y*col1 + (z*col2 + w*col3))
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1704
1705 #if 0
// copies 'numitems' 4-float vectors from in4f to out4f (regions must not overlap)
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1710 #endif
1711
1712 #ifdef SSE_POSSIBLE
// perspective-divides one vertex and maps it through the viewport
//   in  = (x, y, z, w)
//   viewportcenter = (c0, c1, c2, c3), viewportscale = (s0, s1, s2, s3)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// 'in' is evaluated once; 'out' may name the same variable as 'in'
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 pv_in = (in); \
	__m128 pv_w = _mm_shuffle_ps(pv_in, pv_in, _MM_SHUFFLE(3, 3, 3, 3)); \
	/* rotate to (w, x, y, z) and replace lane 0 to get (1, x, y, z) */ \
	__m128 pv_acc = _mm_shuffle_ps(pv_in, pv_in, _MM_SHUFFLE(2, 1, 0, 3)); \
	pv_acc = _mm_move_ss(pv_acc, _mm_set_ss(1.0f)); \
	pv_acc = _mm_mul_ps(viewportscale, pv_acc); \
	pv_acc = _mm_div_ps(pv_acc, pv_w); \
	pv_acc = _mm_add_ps(viewportcenter, pv_acc); \
	/* rotate back so lane 0 of the sum ends up in lane 3 */ \
	out = _mm_shuffle_ps(pv_acc, pv_acc, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1720
// same computation as DPSOFTRAST_PROJECTVERTEX (all four lanes are projected):
//   in  = (x, y, z, w)
//   viewportcenter = (c0, c1, c2, c3), viewportscale = (s0, s1, s2, s3)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// 'in' is evaluated once; 'out' may name the same variable as 'in'
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 py_src = (in); \
	__m128 py_w = _mm_shuffle_ps(py_src, py_src, _MM_SHUFFLE(3, 3, 3, 3)); \
	/* (1, x, y, z) */ \
	__m128 py_one_xyz = _mm_move_ss(_mm_shuffle_ps(py_src, py_src, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	__m128 py_scaled = _mm_div_ps(_mm_mul_ps(viewportscale, py_one_xyz), py_w); \
	__m128 py_mapped = _mm_add_ps(viewportcenter, py_scaled); \
	out = _mm_shuffle_ps(py_mapped, py_mapped, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1728
// out = x*m0 + y*m1 + z*m2 + w*m3 for in = (x, y, z, w)
// summed as x*m0 + (y*m1 + (z*m2 + w*m3)); 'in' is evaluated once and
// 'out' may name the same variable as 'in'
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 tv_in = (in); \
	__m128 tv_sum = _mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(2, 2, 2, 2)), m2), tv_sum); \
	tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(1, 1, 1, 1)), m1), tv_sum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(0, 0, 0, 0)), m0), tv_sum); \
}
1737
1738 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1739 {
1740         int clipmask = 0xFF;
1741         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1742         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1743         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1744         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1745         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1746         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1747         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1748         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1749         #define BBFRONT(k, pos) \
1750         { \
1751                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1752                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1753                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1754                 { \
1755                         __m128 proj; \
1756                         clipmask &= ~(1<<k); \
1757                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1758                         minproj = _mm_min_ss(minproj, proj); \
1759                         maxproj = _mm_max_ss(maxproj, proj); \
1760                 } \
1761         }
1762         BBFRONT(0, minpos); 
1763         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1764         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1765         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1766         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1767         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1768         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1769         BBFRONT(7, maxpos);
1770         #define BBCLIP(k) \
1771         { \
1772                 if (clipmask&(1<<k)) \
1773                 { \
1774                         if (!(clipmask&(1<<(k^1)))) \
1775                         { \
1776                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1777                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1778                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1779                                 minproj = _mm_min_ss(minproj, proj); \
1780                                 maxproj = _mm_max_ss(maxproj, proj); \
1781                         } \
1782                         if (!(clipmask&(1<<(k^2)))) \
1783                         { \
1784                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1785                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1786                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1787                                 minproj = _mm_min_ss(minproj, proj); \
1788                                 maxproj = _mm_max_ss(maxproj, proj); \
1789                         } \
1790                         if (!(clipmask&(1<<(k^4)))) \
1791                         { \
1792                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1793                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1794                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1795                                 minproj = _mm_min_ss(minproj, proj); \
1796                                 maxproj = _mm_max_ss(maxproj, proj); \
1797                         } \
1798                 } \
1799         }
1800         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1801         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1802         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1803         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1804         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1805         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1806         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1807         *starty = _mm_cvttss_si32(maxproj);
1808         *endy = _mm_cvttss_si32(minproj)+1;
1809         return clipmask;
1810 }
1811         
1812 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1813 {
1814         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1815         float *end = out4f + numitems*4;
1816         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1817         __m128 minpos, maxpos;
1818         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1819         {
1820                 minpos = maxpos = _mm_loadu_ps(in4f);
1821                 while (out4f < end)
1822                 {
1823                         __m128 v = _mm_loadu_ps(in4f);
1824                         minpos = _mm_min_ps(minpos, v);
1825                         maxpos = _mm_max_ps(maxpos, v);
1826                         _mm_store_ps(out4f, v);
1827                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828                         _mm_store_ps(screen4f, v);
1829                         in4f += 4;
1830                         out4f += 4;
1831                         screen4f += 4;
1832                 }
1833         }
1834         else
1835         {
1836                 minpos = maxpos = _mm_load_ps(in4f);
1837                 while (out4f < end)
1838                 {
1839                         __m128 v = _mm_load_ps(in4f);
1840                         minpos = _mm_min_ps(minpos, v);
1841                         maxpos = _mm_max_ps(maxpos, v);
1842                         _mm_store_ps(out4f, v);
1843                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1844                         _mm_store_ps(screen4f, v);
1845                         in4f += 4;
1846                         out4f += 4;
1847                         screen4f += 4;
1848                 }
1849         }
1850         if (starty && endy) 
1851         {
1852                 ALIGN(float minposf[4]);
1853                 ALIGN(float maxposf[4]);
1854                 _mm_store_ps(minposf, minpos);
1855                 _mm_store_ps(maxposf, maxpos);
1856                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1857         }
1858         return 0;
1859 }
1860
1861 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1862 {
1863         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1864         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1865         float *end;
1866         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1867                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1868         end = out4f + numitems*4;
1869         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1870         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1871         m0 = _mm_loadu_ps(inmatrix16f);
1872         m1 = _mm_loadu_ps(inmatrix16f + 4);
1873         m2 = _mm_loadu_ps(inmatrix16f + 8);
1874         m3 = _mm_loadu_ps(inmatrix16f + 12);
1875         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1876         {
1877                 minpos = maxpos = _mm_loadu_ps(in4f);
1878                 while (out4f < end)
1879                 {
1880                         __m128 v = _mm_loadu_ps(in4f);
1881                         minpos = _mm_min_ps(minpos, v);
1882                         maxpos = _mm_max_ps(maxpos, v);
1883                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1884                         _mm_store_ps(out4f, v);
1885                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1886                         _mm_store_ps(screen4f, v);
1887                         in4f += 4;
1888                         out4f += 4;
1889                         screen4f += 4;
1890                 }
1891         }
1892         else
1893         {
1894                 minpos = maxpos = _mm_load_ps(in4f);
1895                 while (out4f < end)
1896                 {
1897                         __m128 v = _mm_load_ps(in4f);
1898                         minpos = _mm_min_ps(minpos, v);
1899                         maxpos = _mm_max_ps(maxpos, v);
1900                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1901                         _mm_store_ps(out4f, v);
1902                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1903                         _mm_store_ps(screen4f, v);
1904                         in4f += 4;
1905                         out4f += 4;
1906                         screen4f += 4;
1907                 }
1908         }
1909         if (starty && endy) 
1910         {
1911                 ALIGN(float minposf[4]);
1912                 ALIGN(float maxposf[4]);
1913                 _mm_store_ps(minposf, minpos);
1914                 _mm_store_ps(maxposf, maxpos);
1915                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1916         }
1917         return 0;
1918 }
1919 #endif
1920
// expands one input vertex array into dpsoftrast.post_array4f[outarray] as
// packed 4-float vectors, covering dpsoftrast.numvertices vertices starting
// at dpsoftrast.firstvertex
//   DPSOFTRAST_ARRAY_POSITION - 3-float positions
//   DPSOFTRAST_ARRAY_COLOR    - 4-float colors if set, else 4-byte colors if
//                               set, else dpsoftrast.color replicated
//   anything else             - texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                               with 2, 3 or 4 float components; if its
//                               pointer is NULL or the component count is
//                               anything else, the output is left untouched
// returns the output array, or NULL without SSE_POSSIBLE
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	int unit;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array at all: use the constant color for every vertex
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1979
1980 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1981 {
1982         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1983         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1984         return data;
1985 }
1986
1987 #if 0
// loads input array 'inarray' into post array 'outarray' (or, when inarray is
// negative, reuses whatever post array 'outarray' already holds), projects it
// into dpsoftrast.screencoord4f and updates dpsoftrast.drawclipped,
// drawstarty and drawendy; returns the array, or NULL without SSE_POSSIBLE
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1998 #endif
1999
// loads input array 'inarray' into post array 'outarray' (or, when inarray is
// negative, reuses whatever post array 'outarray' already holds), transforms
// it in place by inmatrix16f, projects it into dpsoftrast.screencoord4f and
// updates dpsoftrast.drawclipped, drawstarty and drawendy;
// returns the array, or NULL without SSE_POSSIBLE
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2010
2011 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2012 {
2013         int x;
2014         int startx = span->startx;
2015         int endx = span->endx;
2016         float wslope = triangle->w[0];
2017         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2018         float endz = 1.0f / (w + wslope * startx);
2019         if (triangle->w[0] == 0)
2020         {
2021                 // LordHavoc: fast flat polygons (HUD/menu)
2022                 for (x = startx;x < endx;x++)
2023                         zf[x] = endz;
2024                 return;
2025         }
2026         for (x = startx;x < endx;)
2027         {
2028                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2029                 float z = endz, dz;
2030                 if (nextsub >= endx) nextsub = endsub = endx-1;
2031                 endz = 1.0f / (w + wslope * nextsub);
2032                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2033                 for (; x <= endsub; x++, z += dz)
2034                         zf[x] = z;
2035         }
2036 }
2037
2038 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2039 {
2040 #ifdef SSE_POSSIBLE
2041         int x;
2042         int startx = span->startx;
2043         int endx = span->endx;
2044         int maskx;
2045         int subx;
2046         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2047         unsigned char * RESTRICT pixelmask = span->pixelmask;
2048         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2049         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2050         if (!pixel)
2051                 return;
2052         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2053         pixeli += span->y * dpsoftrast.fb_width + span->x;
2054         // handle alphatest now (this affects depth writes too)
2055         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2056                 for (x = startx;x < endx;x++)
2057                         if (in4ub[x*4+3] < 128)
2058                                 pixelmask[x] = false;
2059         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2060         // helps sprites, text and hud artwork
2061         switch(thread->fb_blendmode)
2062         {
2063         case DPSOFTRAST_BLENDMODE_ALPHA:
2064         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2065         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2066                 maskx = startx;
2067                 for (x = startx;x < endx;x++)
2068                 {
2069                         if (in4ub[x*4+3] >= 1)
2070                         {
2071                                 startx = x;
2072                                 for (;;)
2073                                 {
2074                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2075                                         maskx = x;
2076                                         if (x >= endx) break;
2077                                         ++x;
2078                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2079                                         if (x >= endx) break;
2080                                 }
2081                                 break;
2082                         }
2083                 }
2084                 endx = maskx;
2085                 break;
2086         case DPSOFTRAST_BLENDMODE_OPAQUE:
2087         case DPSOFTRAST_BLENDMODE_ADD:
2088         case DPSOFTRAST_BLENDMODE_INVMOD:
2089         case DPSOFTRAST_BLENDMODE_MUL:
2090         case DPSOFTRAST_BLENDMODE_MUL2:
2091         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2092         case DPSOFTRAST_BLENDMODE_INVADD:
2093                 break;
2094         }
2095         // put some special values at the end of the mask to ensure the loops end
2096         pixelmask[endx] = 1;
2097         pixelmask[endx+1] = 0;
2098         // LordHavoc: use a double loop to identify subspans, this helps the
2099         // optimized copy/blend loops to perform at their best, most triangles
2100         // have only one run of pixels, and do the search using wide reads...
2101         x = startx;
2102         while (x < endx)
2103         {
2104                 // if this pixel is masked off, it's probably not alone...
2105                 if (!pixelmask[x])
2106                 {
2107                         x++;
2108 #if 1
2109                         if (x + 8 < endx)
2110                         {
2111                                 // the 4-item search must be aligned or else it stalls badly
2112                                 if ((x & 3) && !pixelmask[x]) 
2113                                 {
2114                                         if(pixelmask[x]) goto endmasked;
2115                                         x++;
2116                                         if (x & 3)
2117                                         {
2118                                                 if(pixelmask[x]) goto endmasked;
2119                                                 x++;
2120                                                 if (x & 3)
2121                                                 {
2122                                                         if(pixelmask[x]) goto endmasked;
2123                                                         x++;
2124                                                 }
2125                                         }
2126                                 }
2127                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2128                                         x += 4;
2129                         }
2130 #endif
2131                         for (;!pixelmask[x];x++)
2132                                 ;
2133                         // rather than continue the loop, just check the end variable
2134                         if (x >= endx)
2135                                 break;
2136                 }
2137         endmasked:
2138                 // find length of subspan
2139                 subx = x + 1;
2140 #if 1
2141                 if (subx + 8 < endx)
2142                 {
2143                         if (subx & 3)
2144                         {
2145                                 if(!pixelmask[subx]) goto endunmasked;
2146                                 subx++;
2147                                 if (subx & 3)
2148                                 {
2149                                         if(!pixelmask[subx]) goto endunmasked;
2150                                         subx++;
2151                                         if (subx & 3)
2152                                         {
2153                                                 if(!pixelmask[subx]) goto endunmasked;
2154                                                 subx++;
2155                                         }
2156                                 }
2157                         }
2158                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2159                                 subx += 4;
2160                 }
2161 #endif
2162                 for (;pixelmask[subx];subx++)
2163                         ;
2164                 // the checks can overshoot, so make sure to clip it...
2165                 if (subx > endx)
2166                         subx = endx;
2167         endunmasked:
2168                 // now that we know the subspan length...  process!
2169                 switch(thread->fb_blendmode)
2170                 {
2171                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2172 #if 0
2173                         if (subx - x >= 16)
2174                         {
2175                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2176                                 x = subx;
2177                         }
2178                         else
2179 #elif 1
2180                         while (x + 16 <= subx)
2181                         {
2182                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2183                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2184                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2185                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2186                                 x += 16;
2187                         }
2188 #endif
2189                         {
2190                                 while (x + 4 <= subx)
2191                                 {
2192                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193                                         x += 4;
2194                                 }
2195                                 if (x + 2 <= subx)
2196                                 {
2197                                         pixeli[x] = ini[x];
2198                                         pixeli[x+1] = ini[x+1];
2199                                         x += 2;
2200                                 }
2201                                 if (x < subx)
2202                                 {
2203                                         pixeli[x] = ini[x];
2204                                         x++;
2205                                 }
2206                         }
2207                         break;
2208                 case DPSOFTRAST_BLENDMODE_ALPHA:
2209                 #define FINISHBLEND(blend2, blend1) \
2210                         for (;x + 1 < subx;x += 2) \
2211                         { \
2212                                 __m128i src, dst; \
2213                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2214                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2215                                 blend2; \
2216                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2217                         } \
2218                         if (x < subx) \
2219                         { \
2220                                 __m128i src, dst; \
2221                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2222                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223                                 blend1; \
2224                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2225                                 x++; \
2226                         }
2227                         FINISHBLEND({
2228                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2229                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                         }, {
2231                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2232                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2233                         });
2234                         break;
2235                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236                         FINISHBLEND({
2237                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                         }, {
2240                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2242                         });
2243                         break;
2244                 case DPSOFTRAST_BLENDMODE_ADD:
2245                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2246                         break;
2247                 case DPSOFTRAST_BLENDMODE_INVMOD:
2248                         FINISHBLEND({
2249                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250                         }, {
2251                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_MUL:
2255                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_MUL2:
2258                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2259                         break;
2260                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2261                         FINISHBLEND({
2262                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2263                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                         }, {
2265                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2266                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2267                         });
2268                         break;
2269                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2270                         FINISHBLEND({
2271                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                         }, {
2274                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2276                         });
2277                         break;
2278                 case DPSOFTRAST_BLENDMODE_INVADD:
2279                         FINISHBLEND({
2280                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281                         }, {
2282                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2283                         });
2284                         break;
2285                 }
2286         }
2287 #endif
2288 }
2289
2290 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2291         // warning: this is SLOW, only use if the optimized per-span functions won't do
2292 {
2293         const unsigned char * RESTRICT pixelbase;
2294         const unsigned char * RESTRICT pixel[4];
2295         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2296         int wrapmask[2] = { width-1, height-1 };
2297         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2298         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2299         {
2300                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2301                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2302                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2303                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2304                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2305                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2306                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2307                 {
2308                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2309                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2310                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2311                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2312                 }
2313                 else
2314                 {
2315                         tci[0] &= wrapmask[0];
2316                         tci[1] &= wrapmask[1];
2317                         tci1[0] &= wrapmask[0];
2318                         tci1[1] &= wrapmask[1];
2319                 }
2320                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2321                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2322                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2323                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2324                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2325                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2326                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2327                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2328         }
2329         else
2330         {
2331                 int tci[2] = { x * width, y * height };
2332                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2333                 {
2334                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2335                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2336                 }
2337                 else
2338                 {
2339                         tci[0] &= wrapmask[0];
2340                         tci[1] &= wrapmask[1];
2341                 }
2342                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2343                 c[0] = pixel[0][0];
2344                 c[1] = pixel[0][1];
2345                 c[2] = pixel[0][2];
2346                 c[3] = pixel[0][3];
2347         }
2348 }
2349
#if 0
// NOTE: this function is compiled out (#if 0).
// Float variant of the per-span texture fetch: samples the texture bound to
// texunitindex for every pixel of the span and stores the result in out4f as
// four floats per pixel (texture bytes 2, 1, 0, 3 scaled to 0..1).
// The texture coordinate comes from the vertex array arrayindex; it is
// evaluated exactly at subspan boundaries (using zf) and stepped linearly in
// 20.12 fixed point in between.
// NOTE(review): the clamp-to-edge paths compare unsigned texel indices against
// tcimin (0), which is always true, so negative coordinates are presumably not
// clamped to 0 as intended - verify before re-enabling this code.
static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
{
	int x;
	int startx = span->startx;
	int endx = span->endx;
	int flags;
	float c[4];
	float data[4];
	float slope[4];
	float tc[2], endtc[2];
	float tcscale[2];
	unsigned int tci[2];
	unsigned int tci1[2];
	unsigned int tcimin[2];
	unsigned int tcimax[2];
	int tciwrapmask[2];
	int tciwidth;
	int filter;
	int mip;
	const unsigned char * RESTRICT pixelbase;
	const unsigned char * RESTRICT pixel[4];
	DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
	// if no texture is bound, just fill it with white
	if (!texture)
	{
		for (x = startx;x < endx;x++)
		{
			out4f[x*4+0] = 1.0f;
			out4f[x*4+1] = 1.0f;
			out4f[x*4+2] = 1.0f;
			out4f[x*4+3] = 1.0f;
		}
		return;
	}
	mip = triangle->mip[texunitindex];
	pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
	// if this mipmap of the texture is 1 pixel, just fill it with that color
	if (texture->mipmap[mip][1] == 4)
	{
		// read the pixel of the selected mip level (pixelbase), not the
		// first pixel of the texture data
		c[0] = pixelbase[2] * (1.0f/255.0f);
		c[1] = pixelbase[1] * (1.0f/255.0f);
		c[2] = pixelbase[0] * (1.0f/255.0f);
		c[3] = pixelbase[3] * (1.0f/255.0f);
		for (x = startx;x < endx;x++)
		{
			out4f[x*4+0] = c[0];
			out4f[x*4+1] = c[1];
			out4f[x*4+2] = c[2];
			out4f[x*4+3] = c[3];
		}
		return;
	}
	filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	flags = texture->flags;
	tcscale[0] = texture->mipmap[mip][2];
	tcscale[1] = texture->mipmap[mip][3];
	tciwidth = texture->mipmap[mip][2];
	tcimin[0] = 0;
	tcimin[1] = 0;
	tcimax[0] = texture->mipmap[mip][2]-1;
	tcimax[1] = texture->mipmap[mip][3]-1;
	tciwrapmask[0] = texture->mipmap[mip][2]-1;
	tciwrapmask[1] = texture->mipmap[mip][3]-1;
	// texel coordinate at the first pixel of the span
	endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
	endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
	if (filter)
	{
		// linear filtering samples around the texel center
		endtc[0] -= 0.5f;
		endtc[1] -= 0.5f;
	}
	for (x = startx;x < endx;)
	{
		unsigned int subtc[2];
		unsigned int substep[2];
		// converts a texel delta across the subspan to a 20.12 step per pixel
		float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
		int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
		if (nextsub >= endx)
		{
			// last subspan ends exactly on the last pixel of the span
			nextsub = endsub = endx-1;
			if (x < nextsub) subscale = 4096.0f / (nextsub - x);
		}
		tc[0] = endtc[0];
		tc[1] = endtc[1];
		endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
		endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
		if (filter)
		{
			endtc[0] -= 0.5f;
			endtc[1] -= 0.5f;
		}
		substep[0] = (endtc[0] - tc[0]) * subscale;
		substep[1] = (endtc[1] - tc[1]) * subscale;
		subtc[0] = tc[0] * (1<<12);
		subtc[1] = tc[1] * (1<<12);
		if (filter)
		{
			if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
			{
				// bilinear, clamped
				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
				{
					unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
					unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
					unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
					tci[0] = subtc[0]>>12;
					tci[1] = subtc[1]>>12;
					tci1[0] = tci[0] + 1;
					tci1[1] = tci[1] + 1;
					tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
					tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
					tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
					tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
					pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
					pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
					pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
					pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
					// the weights sum to 1<<24 and a byte is at most 255,
					// so 0xFF000000 normalizes the sum to 0..1
					c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
					c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
					c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
					c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
					out4f[x*4+0] = c[0];
					out4f[x*4+1] = c[1];
					out4f[x*4+2] = c[2];
					out4f[x*4+3] = c[3];
				}
			}
			else
			{
				// bilinear, wrapped
				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
				{
					unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
					unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
					unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
					tci[0] = subtc[0]>>12;
					tci[1] = subtc[1]>>12;
					tci1[0] = tci[0] + 1;
					tci1[1] = tci[1] + 1;
					tci[0] &= tciwrapmask[0];
					tci[1] &= tciwrapmask[1];
					tci1[0] &= tciwrapmask[0];
					tci1[1] &= tciwrapmask[1];
					pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
					pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
					pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
					pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
					c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
					c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
					c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
					c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
					out4f[x*4+0] = c[0];
					out4f[x*4+1] = c[1];
					out4f[x*4+2] = c[2];
					out4f[x*4+3] = c[3];
				}
			}
		}
		else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
		{
			// nearest, clamped
			for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
			{
				tci[0] = subtc[0]>>12;
				tci[1] = subtc[1]>>12;
				tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
				tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
				pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
				c[0] = pixel[0][2] * (1.0f / 255.0f);
				c[1] = pixel[0][1] * (1.0f / 255.0f);
				c[2] = pixel[0][0] * (1.0f / 255.0f);
				c[3] = pixel[0][3] * (1.0f / 255.0f);
				out4f[x*4+0] = c[0];
				out4f[x*4+1] = c[1];
				out4f[x*4+2] = c[2];
				out4f[x*4+3] = c[3];
			}
		}
		else
		{
			// nearest, wrapped
			for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
			{
				tci[0] = subtc[0]>>12;
				tci[1] = subtc[1]>>12;
				tci[0] &= tciwrapmask[0];
				tci[1] &= tciwrapmask[1];
				pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
				c[0] = pixel[0][2] * (1.0f / 255.0f);
				c[1] = pixel[0][1] * (1.0f / 255.0f);
				c[2] = pixel[0][0] * (1.0f / 255.0f);
				c[3] = pixel[0][3] * (1.0f / 255.0f);
				out4f[x*4+0] = c[0];
				out4f[x*4+1] = c[1];
				out4f[x*4+2] = c[2];
				out4f[x*4+3] = c[3];
			}
		}
	}
}
#endif
2548
2549 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2550 {
2551 #ifdef SSE_POSSIBLE
2552         int x;
2553         int startx = span->startx;
2554         int endx = span->endx;
2555         int flags;
2556         __m128 data, slope, tcscale;
2557         __m128i tcsize, tcmask, tcoffset, tcmax;
2558         __m128 tc, endtc;
2559         __m128i subtc, substep, endsubtc;
2560         int filter;
2561         int mip;
2562         int affine; // LordHavoc: optimized affine texturing case
2563         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2564         const unsigned char * RESTRICT pixelbase;
2565         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2566         // if no texture is bound, just fill it with white
2567         if (!texture)
2568         {
2569                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2570                 return;
2571         }
2572         mip = triangle->mip[texunitindex];
2573         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2574         // if this mipmap of the texture is 1 pixel, just fill it with that color
2575         if (texture->mipmap[mip][1] == 4)
2576         {
2577                 unsigned int k = *((const unsigned int *)pixelbase);
2578                 for (x = startx;x < endx;x++)
2579                         outi[x] = k;
2580                 return;
2581         }
2582         affine = zf[startx] == zf[endx-1];
2583         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2584         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2585         flags = texture->flags;
2586         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2587         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2588         tcscale = _mm_cvtepi32_ps(tcsize);
2589         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2590         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2591         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2592         if (filter)
2593                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2594         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2595         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2596         tcmax = _mm_packs_epi32(tcmask, tcmask);
2597         for (x = startx;x < endx;)
2598         {
2599                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2600                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2601                 if (nextsub >= endx || affine)
2602                 {
2603                         nextsub = endsub = endx-1;
2604                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2605                 }       
2606                 tc = endtc;
2607                 subtc = endsubtc;
2608                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2609                 if (filter)
2610                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2611                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2612                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2613                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2614                 substep = _mm_slli_epi32(substep, 1);
2615                 if (filter)
2616                 {
2617                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2618                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2619                         {
2620                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2621                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2622                                 {
2623                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2624                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2625                                         tci = _mm_madd_epi16(tci, tcoffset);
2626                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2627                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2628                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2629                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2630                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2631                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2632                                         fracm = _mm_srli_epi16(subtc, 1);
2633                                         pix1 = _mm_add_epi16(pix1,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636                                         pix3 = _mm_add_epi16(pix3,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2638                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2639                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2640                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2641                                         pix2 = _mm_add_epi16(pix2,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2643                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2644                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2645                                 }
2646                                 if (x <= endsub)
2647                                 {
2648                                         const unsigned char * RESTRICT ptr1;
2649                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2650                                         tci = _mm_madd_epi16(tci, tcoffset);
2651                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2652                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2653                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2654                                         fracm = _mm_srli_epi16(subtc, 1);
2655                                         pix1 = _mm_add_epi16(pix1,
2656                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2658                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2659                                         pix1 = _mm_add_epi16(pix1,
2660                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2661                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2662                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2663                                         x++;
2664                                 }
2665                         }
2666                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2667                         {
2668                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2669                                 {
2670                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2671                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2672                                         tci = _mm_madd_epi16(tci, tcoffset);
2673                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2674                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2675                                                                                         _mm_setzero_si128());
2676                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2677                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2678                                                                                         _mm_setzero_si128());
2679                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2680                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2681                                         tci = _mm_madd_epi16(tci, tcoffset);
2682                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         fracm = _mm_srli_epi16(subtc, 1);
2689                                         pix1 = _mm_add_epi16(pix1,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2691                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2692                                         pix3 = _mm_add_epi16(pix3,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2694                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2695                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2696                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2697                                         pix2 = _mm_add_epi16(pix2,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2699                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2700                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2701                                 }
2702                                 if (x <= endsub)
2703                                 {
2704                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2705                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2706                                         tci = _mm_madd_epi16(tci, tcoffset);
2707                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2708                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2709                                                                                         _mm_setzero_si128());
2710                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2711                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2712                                                                                         _mm_setzero_si128());
2713                                         fracm = _mm_srli_epi16(subtc, 1);
2714                                         pix1 = _mm_add_epi16(pix1,
2715                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2716                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2717                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2718                                         pix1 = _mm_add_epi16(pix1,
2719                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2720                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2721                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2722                                         x++;
2723                                 }
2724                         }
2725                         else
2726                         {
2727                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2728                                 {
2729                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2730                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2731                                         tci = _mm_madd_epi16(tci, tcoffset);
2732                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2733                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2734                                                                                         _mm_setzero_si128());
2735                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2736                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2737                                                                                         _mm_setzero_si128());
2738                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2739                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2742                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2743                                                                                         _mm_setzero_si128());
2744                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2745                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2746                                                                                         _mm_setzero_si128());
2747                                         fracm = _mm_srli_epi16(subtc, 1);
2748                                         pix1 = _mm_add_epi16(pix1,
2749                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2750                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2751                                         pix3 = _mm_add_epi16(pix3,
2752                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2753                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2754                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2755                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2756                                         pix2 = _mm_add_epi16(pix2,
2757                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2758                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2759                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2760                                 }
2761                                 if (x <= endsub)
2762                                 {
2763                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2764                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2765                                         tci = _mm_madd_epi16(tci, tcoffset);
2766                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2767                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2768                                                                                         _mm_setzero_si128());
2769                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2770                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2771                                                                                         _mm_setzero_si128());
2772                                         fracm = _mm_srli_epi16(subtc, 1);
2773                                         pix1 = _mm_add_epi16(pix1,
2774                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2775                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2776                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2777                                         pix1 = _mm_add_epi16(pix1,
2778                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2779                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2780                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2781                                         x++;
2782                                 }
2783                         }
2784                 }
2785                 else
2786                 {
2787                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2788                         {
2789                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2790                                 {
2791                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2792                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2793                                         tci = _mm_madd_epi16(tci, tcoffset);
2794                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2795                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2796                                 }
2797                                 if (x <= endsub)
2798                                 {
2799                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2800                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2801                                         tci = _mm_madd_epi16(tci, tcoffset);
2802                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2803                                         x++;
2804                                 }
2805                         }
2806                         else
2807                         {
2808                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2809                                 {
2810                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2811                                         tci = _mm_and_si128(tci, tcmax); 
2812                                         tci = _mm_madd_epi16(tci, tcoffset);
2813                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2814                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2815                                 }
2816                                 if (x <= endsub)
2817                                 {
2818                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2819                                         tci = _mm_and_si128(tci, tcmax); 
2820                                         tci = _mm_madd_epi16(tci, tcoffset);
2821                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2822                                         x++;
2823                                 }
2824                         }
2825                 }
2826         }
2827 #endif
2828 }
2829
2830 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2831 {
2832         // TODO: IMPLEMENT
2833         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2834 }
2835
// Shadowmap lookup stub: the input vector is not examined and the constant
// 1.0f is returned for every query.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2841
2842 #if 0
2843 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2844 {
2845         int x;
2846         int startx = span->startx;
2847         int endx = span->endx;
2848         float c[4];
2849         float data[4];
2850         float slope[4];
2851         float z;
2852         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2853         for (x = startx;x < endx;x++)
2854         {
2855                 z = zf[x];
2856                 c[0] = (data[0] + slope[0]*x) * z;
2857                 c[1] = (data[1] + slope[1]*x) * z;
2858                 c[2] = (data[2] + slope[2]*x) * z;
2859                 c[3] = (data[3] + slope[3]*x) * z;
2860                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2861                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2862                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2863                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2864         }
2865 }
2866 #endif
2867
2868 #if 0
2869 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2870 {
2871         int x;
2872         int startx = span->startx;
2873         int endx = span->endx;
2874         float c[4];
2875         float data[4];
2876         float slope[4];
2877         float z;
2878         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2879         for (x = startx;x < endx;x++)
2880         {
2881                 z = zf[x];
2882                 c[0] = (data[0] + slope[0]*x) * z;
2883                 c[1] = (data[1] + slope[1]*x) * z;
2884                 c[2] = (data[2] + slope[2]*x) * z;
2885                 c[3] = (data[3] + slope[3]*x) * z;
2886                 out4f[x*4+0] = c[0];
2887                 out4f[x*4+1] = c[1];
2888                 out4f[x*4+2] = c[2];
2889                 out4f[x*4+3] = c[3];
2890         }
2891 }
2892 #endif
2893
2894 #if 0
2895 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2896 {
2897         int x, startx = span->startx, endx = span->endx;
2898         float c[4], localcolor[4];
2899         localcolor[0] = subcolor[0];
2900         localcolor[1] = subcolor[1];
2901         localcolor[2] = subcolor[2];
2902         localcolor[3] = subcolor[3];
2903         for (x = startx;x < endx;x++)
2904         {
2905                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2906                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2907                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2908                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2909                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2910                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2911                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2912                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2913         }
2914 }
2915 #endif
2916
2917 #if 0
2918 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2919 {
2920         int x, startx = span->startx, endx = span->endx;
2921         for (x = startx;x < endx;x++)
2922         {
2923                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2924                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2925                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2926                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2927         }
2928 }
2929 #endif
2930
2931 #if 0
2932 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2933 {
2934         int x, startx = span->startx, endx = span->endx;
2935         for (x = startx;x < endx;x++)
2936         {
2937                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2938                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2939                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2940                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2941         }
2942 }
2943 #endif
2944
2945 #if 0
2946 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2947 {
2948         int x, startx = span->startx, endx = span->endx;
2949         float a, b;
2950         for (x = startx;x < endx;x++)
2951         {
2952                 a = 1.0f - inb4f[x*4+3];
2953                 b = inb4f[x*4+3];
2954                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2955                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2956                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2957                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2958         }
2959 }
2960 #endif
2961
2962 #if 0
2963 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2964 {
2965         int x, startx = span->startx, endx = span->endx;
2966         float localcolor[4], ilerp, lerp;
2967         localcolor[0] = color[0];
2968         localcolor[1] = color[1];
2969         localcolor[2] = color[2];
2970         localcolor[3] = color[3];
2971         ilerp = 1.0f - localcolor[3];
2972         lerp = localcolor[3];
2973         for (x = startx;x < endx;x++)
2974         {
2975                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2976                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2977                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2978                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2979         }
2980 }
2981 #endif
2982
2983
2984
2985 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2986 {
2987 #ifdef SSE_POSSIBLE
2988         int x;
2989         int startx = span->startx;
2990         int endx = span->endx;
2991         __m128 data, slope;
2992         __m128 mod, endmod;
2993         __m128i submod, substep, endsubmod;
2994         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2995         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2996         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2997         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2998         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2999         for (x = startx; x < endx;)
3000         {
3001                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3002                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3003                 if (nextsub >= endx)
3004                 {
3005                         nextsub = endsub = endx-1;
3006                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3007                 }
3008                 mod = endmod;
3009                 submod = endsubmod;
3010                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3011                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3012                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3013                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3014                 substep = _mm_packs_epi32(substep, substep);
3015                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3016                 {
3017                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3018                         pix = _mm_mulhi_epu16(pix, submod);
3019                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3020                 }
3021                 if (x <= endsub)
3022                 {
3023                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3024                         pix = _mm_mulhi_epu16(pix, submod);
3025                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3026                         x++;
3027                 }
3028         }
3029 #endif
3030 }
3031
3032 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3033 {
3034 #ifdef SSE_POSSIBLE
3035         int x;
3036         int startx = span->startx;
3037         int endx = span->endx;
3038         __m128 data, slope;
3039         __m128 mod, endmod;
3040         __m128i submod, substep, endsubmod;
3041         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3042         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3043         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3044         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3045         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3046         for (x = startx; x < endx;)
3047         {
3048                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3049                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3050                 if (nextsub >= endx)
3051                 {
3052                         nextsub = endsub = endx-1;
3053                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3054                 }
3055                 mod = endmod;
3056                 submod = endsubmod;
3057                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3058                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3059                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3060                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3061                 substep = _mm_packs_epi32(substep, substep);
3062                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3063                 {
3064                         __m128i pix = _mm_srai_epi16(submod, 4);
3065                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3066                 }
3067                 if (x <= endsub)
3068                 {
3069                         __m128i pix = _mm_srai_epi16(submod, 4);
3070                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3071                         x++;
3072                 }
3073         }
3074 #endif
3075 }
3076
3077 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3078 {
3079 #ifdef SSE_POSSIBLE
3080         int x, startx = span->startx, endx = span->endx;
3081         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3082         localcolor = _mm_packs_epi32(localcolor, localcolor);
3083         for (x = startx;x+2 <= endx;x+=2)
3084         {
3085                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3087                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3088                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3089         }
3090         if (x < endx)
3091         {
3092                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3093                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3094                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3095                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3096         }
3097 #endif
3098 }
3099
3100 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3101 {
3102 #ifdef SSE_POSSIBLE
3103         int x, startx = span->startx, endx = span->endx;
3104         for (x = startx;x+2 <= endx;x+=2)
3105         {
3106                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3107                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3108                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3109                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3110         }
3111         if (x < endx)
3112         {
3113                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3114                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3115                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3116                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3117         }
3118 #endif
3119 }
3120
3121 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3122 {
3123 #ifdef SSE_POSSIBLE
3124         int x, startx = span->startx, endx = span->endx;
3125         for (x = startx;x+2 <= endx;x+=2)
3126         {
3127                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3128                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3129                 pix1 = _mm_add_epi16(pix1, pix2);
3130                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3131         }
3132         if (x < endx)
3133         {
3134                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3135                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3136                 pix1 = _mm_add_epi16(pix1, pix2);
3137                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3138         }
3139 #endif
3140 }
3141
3142 #if 0
3143 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3144 {
3145 #ifdef SSE_POSSIBLE
3146         int x, startx = span->startx, endx = span->endx;
3147         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3148         tint = _mm_packs_epi32(tint, tint);
3149         for (x = startx;x+2 <= endx;x+=2)
3150         {
3151                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3152                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3153                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3154                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3155         }
3156         if (x < endx)
3157         {
3158                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3159                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3160                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3161                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3162         }
3163 #endif
3164 }
3165 #endif
3166
3167 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3168 {
3169 #ifdef SSE_POSSIBLE
3170         int x, startx = span->startx, endx = span->endx;
3171         for (x = startx;x+2 <= endx;x+=2)
3172         {
3173                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3174                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3175                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3176                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3177                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3178         }
3179         if (x < endx)
3180         {
3181                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3182                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3183                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3184                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3185                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3186         }
3187 #endif
3188 }
3189
3190 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3191 {
3192 #ifdef SSE_POSSIBLE
3193         int x, startx = span->startx, endx = span->endx;
3194         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3195         localcolor = _mm_packs_epi32(localcolor, localcolor);
3196         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3197         for (x = startx;x+2 <= endx;x+=2)
3198         {
3199                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3200                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3201                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3202         }
3203         if (x < endx)
3204         {
3205                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3206                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3207                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3208         }
3209 #endif
3210 }
3211
3212
3213
3214 static void DPSOFTRAST_VertexShader_Generic(void)
3215 {
3216         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3217         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3218         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3219         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3220                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3221 }
3222
3223 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3224 {
3225         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3226         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3230         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3231         {
3232                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3233                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3234                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3235                 {
3236                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3237                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3238                         {
3239                                 // multiply
3240                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3241                         }
3242                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3243                         {
3244                                 // add
3245                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3246                         }
3247                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3248                         {
3249                                 // alphablend
3250                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3251                         }
3252                 }
3253         }
3254         else
3255                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3256         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3257 }
3258
3259
3260
3261 static void DPSOFTRAST_VertexShader_PostProcess(void)
3262 {
3263         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3264         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3265         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3266 }
3267
3268 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3269 {
3270         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3271         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3272         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3273         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3274         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3275         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3276         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3277         {
3278                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3279                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3280         }
3281         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3282         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3283         {
3284                 // TODO: implement saturation
3285         }
3286         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3287         {
3288                 // TODO: implement gammaramps
3289         }
3290         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3291 }
3292
3293
3294
3295 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3296 {
3297         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3298 }
3299
3300 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3301 {
3302         // this is never called (because colormask is off when this shader is used)
3303         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3304         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3305         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3306         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3307         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3308 }
3309
3310
3311
3312 static void DPSOFTRAST_VertexShader_FlatColor(void)
3313 {
3314         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3315         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3316 }
3317
3318 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3319 {
3320 #ifdef SSE_POSSIBLE
3321         unsigned char * RESTRICT pixelmask = span->pixelmask;
3322         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3323         int x, startx = span->startx, endx = span->endx;
3324         __m128i Color_Ambientm;
3325         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3326         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3327         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3328         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3329         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3330         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3331                 pixel = buffer_FragColorbgra8;
3332         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3333         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3334         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3335         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3336         for (x = startx;x < endx;x++)
3337         {
3338                 __m128i color, pix;
3339                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3340                 {
3341                         __m128i pix2;
3342                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3343                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3344                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3345                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3346                         x += 3;
3347                         continue;
3348                 }
3349                 if (!pixelmask[x])
3350                         continue;
3351                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3352                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3353                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3354         }
3355         if (pixel == buffer_FragColorbgra8)
3356                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3357 #endif
3358 }
3359
3360
3361
3362 static void DPSOFTRAST_VertexShader_VertexColor(void)
3363 {
3364         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3365         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3366         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3367 }
3368
3369 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3370 {
3371 #ifdef SSE_POSSIBLE
3372         unsigned char * RESTRICT pixelmask = span->pixelmask;
3373         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3374         int x, startx = span->startx, endx = span->endx;
3375         __m128i Color_Ambientm, Color_Diffusem;
3376         __m128 data, slope;
3377         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3378         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3381         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3382         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3383         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3384                 pixel = buffer_FragColorbgra8;
3385         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3386         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3387         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3388         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3389         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3390         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3391         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3392         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3393         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3394         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3395         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3396         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3397         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3398         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3399         {
3400                 __m128i color, mod, pix;
3401                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3402                 {
3403                         __m128i pix2, mod2;
3404                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3405                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3406                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3407                         data = _mm_add_ps(data, slope);
3408                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3409                         data = _mm_add_ps(data, slope);
3410                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3411                         data = _mm_add_ps(data, slope);
3412                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3413                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3414                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3415                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3416                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3417                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3418                         x += 3;
3419                         continue;
3420                 }
3421                 if (!pixelmask[x])
3422                         continue;
3423                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3424                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3425                 mod = _mm_packs_epi32(mod, mod);
3426                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3427                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3428         }
3429         if (pixel == buffer_FragColorbgra8)
3430                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3431 #endif
3432 }
3433
3434
3435
3436 static void DPSOFTRAST_VertexShader_Lightmap(void)
3437 {
3438         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3439         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3440         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3441 }
3442
3443 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3444 {
3445 #ifdef SSE_POSSIBLE
3446         unsigned char * RESTRICT pixelmask = span->pixelmask;
3447         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3448         int x, startx = span->startx, endx = span->endx;
3449         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3450         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3451         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3452         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3453         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3454         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3455         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3456         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3457         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3458         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3459                 pixel = buffer_FragColorbgra8;
3460         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3461         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3462         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3463         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3464         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3465         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3466         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3467         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3468         {
3469                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3470                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3471                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3472                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3473                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3474                 for (x = startx;x < endx;x++)
3475                 {
3476                         __m128i color, lightmap, glow, pix;
3477                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3478                         {
3479                                 __m128i pix2;
3480                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3481                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3482                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3483                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3484                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3485                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3486                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3487                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3488                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3489                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3490                                 x += 3;
3491                                 continue;
3492                         }
3493                         if (!pixelmask[x])
3494                                 continue;
3495                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3496                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3497                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3498                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3499                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3500                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3501                 }
3502         }
3503         else
3504         {
3505                 for (x = startx;x < endx;x++)
3506                 {
3507                         __m128i color, lightmap, pix;
3508                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3509                         {
3510                                 __m128i pix2;
3511                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3512                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3513                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3514                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3515                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3516                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3517                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3518                                 x += 3;
3519                                 continue;
3520                         }
3521                         if (!pixelmask[x]) 
3522                                 continue;
3523                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3524                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3525                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3526                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3527                 }
3528         }
3529         if (pixel == buffer_FragColorbgra8)
3530                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3531 #endif
3532 }
3533
3534
3535 void DPSOFTRAST_VertexShader_LightDirection(void);
3536 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3537
// FakeLight has no vertex work of its own: it reuses the LightDirection vertex shader as is.
static void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3542
3543 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3544 {
3545         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3546 }
3547
3548
3549
3550 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3551 {
3552         DPSOFTRAST_VertexShader_LightDirection();
3553         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3554 }
3555
3556 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3557 {
3558         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3559 }
3560
3561
3562
3563 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3564 {
3565         DPSOFTRAST_VertexShader_LightDirection();
3566         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3567 }
3568
3569 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3570 {
3571         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3572 }
3573
3574
3575
3576 void DPSOFTRAST_VertexShader_LightDirection(void)
3577 {
3578         int i;
3579         int numvertices = dpsoftrast.numvertices;
3580         float LightDir[4];
3581         float LightVector[4];
3582         float EyePosition[4];
3583         float EyeVectorModelSpace[4];
3584         float EyeVector[4];
3585         float position[4];
3586         float svector[4];
3587         float tvector[4];
3588         float normal[4];
3589         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3590         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3591         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3592         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3593         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3594         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3595         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3596         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3597         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3598         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3599         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3600         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3601         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3602         for (i = 0;i < numvertices;i++)
3603         {
3604                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3605                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3606                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3607                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3608                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3609                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3610                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3611                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3612                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3613                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3614                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3615                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3616                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3617                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3618                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3619                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3620                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3621                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3622                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3623                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3624                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3625                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3626                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3627                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3628                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3629                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3632                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3633         }
3634         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3635 }
3636
3637 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3638 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3639 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3640 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3641 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3642 #define DPSOFTRAST_Vector3Normalize(v)\
3643 do\
3644 {\
3645         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3646         if (len)\
3647         {\
3648                 len = 1.0f / len;\
3649                 v[0] *= len;\
3650                 v[1] *= len;\
3651                 v[2] *= len;\
3652         }\
3653 }\
3654 while(0)
3655
3656 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3657 {
3658         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3659         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668         int x, startx = span->startx, endx = span->endx;
3669         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3670         float LightVectordata[4];
3671         float LightVectorslope[4];
3672         float EyeVectordata[4];
3673         float EyeVectorslope[4];
3674         float VectorSdata[4];
3675         float VectorSslope[4];
3676         float VectorTdata[4];
3677         float VectorTslope[4];
3678         float VectorRdata[4];
3679         float VectorRslope[4];
3680         float z;
3681         float diffusetex[4];
3682         float glosstex[4];
3683         float surfacenormal[4];
3684         float lightnormal[4];
3685         float lightnormal_modelspace[4];
3686         float eyenormal[4];
3687         float specularnormal[4];
3688         float diffuse;
3689         float specular;
3690         float SpecularPower;
3691         int d[4];
3692         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3693         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3694         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3695         Color_Glow[3] = 0.0f;
3696         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3697         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3698         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3699         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3700         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3701         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3702         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3703         Color_Pants[3] = 0.0f;
3704         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3705         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3706         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3707         Color_Shirt[3] = 0.0f;
3708         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3709         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3710         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3711         {
3712                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3713                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3714         }
3715         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3716         {
3717                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3718         }
3719         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3720         {
3721                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3722                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3723                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3724                 Color_Diffuse[3] = 0.0f;
3725                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3726                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3727                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3728                 LightColor[3] = 0.0f;
3729                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3730                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3731                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3732                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3733                 Color_Specular[3] = 0.0f;
3734                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3735                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3736                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3737
3738                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3739                 {
3740                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3741                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3742                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3743                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3744                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3745                 }
3746                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3747                 {
3748                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3749                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3750                 }
3751                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3752                 {
3753                         // nothing of this needed
3754                 }
3755                 else
3756                 {
3757                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3758                 }
3759
3760                 for (x = startx;x < endx;x++)
3761                 {
3762                         z = buffer_z[x];
3763                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3764                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3765                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3766                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3767                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3768                         {
3769                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3770                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3771                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3772                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3773                         }
3774                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3775                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3776                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3777                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3778                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3779                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3780                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3781                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3782
3783                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3784                         {
3785                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3786                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3787                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3788                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3789
3790                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3791                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3792                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3793                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3794
3795                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3796                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3797                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3798                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3799
3800                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3801                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3802                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3803                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3804
3805                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3806                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3807
3808                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3809                                 {
3810                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3811                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3812                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3813                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3814                                 }
3815                         }
3816                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3817                         {
3818                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3819                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3820                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3821                                 {
3822                                         float f = 1.0f / 256.0f;
3823                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3824                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3825                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3826                                 }
3827                         }
3828                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3829                         {
3830                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3831                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3832                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3833                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3834
3835                                 LightColor[0] = 1.0;
3836                                 LightColor[1] = 1.0;
3837                                 LightColor[2] = 1.0;
3838                         }
3839                         else
3840                         {
3841                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3842                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3843                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3844                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3845                         }
3846
3847                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3848
3849                         if(thread->shader_exactspecularmath)
3850                         {
3851                                 // reflect lightnormal at surfacenormal, take the negative of that
3852                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3853                                 float f;
3854                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3855                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3856                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3857                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3858
3859                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3860                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3861                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3862                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3863                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3864
3865                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3866                         }
3867                         else
3868                         {
3869                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3870                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3871                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3872                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3873
3874                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3875                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3876                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3877                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3878
3879                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3880                         }
3881                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3882
3883                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3884                         {
3885                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3886                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3887                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3888                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3889                         }
3890                         else
3891                         {
3892                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3893                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3894                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3895                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3896                         }
3897
3898                         buffer_FragColorbgra8[x*4+0] = d[0];
3899                         buffer_FragColorbgra8[x*4+1] = d[1];
3900                         buffer_FragColorbgra8[x*4+2] = d[2];
3901                         buffer_FragColorbgra8[x*4+3] = d[3];
3902                 }
3903         }
3904         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3905         {
3906                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3907                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3908                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3909                 Color_Diffuse[3] = 0.0f;
3910                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3911                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3912                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3913                 LightColor[3] = 0.0f;
3914                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3915
3916                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3917                 {
3918                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3919                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3920                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3921                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3922                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3923                 }
3924                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3925                 {
3926                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3927                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3928                 }
3929                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3930                 {
3931                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3932                 }
3933                 else
3934                 {
3935                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3936                 }
3937
3938                 for (x = startx;x < endx;x++)
3939                 {
3940                         z = buffer_z[x];
3941                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3946                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3947                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3948                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3949
3950                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3951                         {
3952                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3953                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3954                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3955                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3956
3957                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3958                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3959                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3960                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3961
3962                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3963                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3964                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3965                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3966
3967                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3968                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3969                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3970                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3971
3972                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3973                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3974
3975                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3976                                 {
3977                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3978                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3979                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3980                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3981                                 }
3982                         }
3983                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3984                         {
3985                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3986                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3987                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3988                                 {
3989                                         float f = 1.0f / 256.0f;
3990                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3991                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3992                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3993                                 }
3994                         }
3995                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3996                         {
3997                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3998                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3999                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4000                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4001
4002                                 LightColor[0] = 1.0;
4003                                 LightColor[1] = 1.0;
4004                                 LightColor[2] = 1.0;
4005                         }
4006                         else
4007                         {
4008                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4009                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4010                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4011                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4012                         }
4013
4014                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4015                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4016                         {
4017                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4018                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4019                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4020                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4021                         }
4022                         else
4023                         {
4024                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4025                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4026                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4027                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4028                         }
4029                         buffer_FragColorbgra8[x*4+0] = d[0];
4030                         buffer_FragColorbgra8[x*4+1] = d[1];
4031                         buffer_FragColorbgra8[x*4+2] = d[2];
4032                         buffer_FragColorbgra8[x*4+3] = d[3];
4033                 }
4034         }
4035         else
4036         {
4037                 for (x = startx;x < endx;x++)
4038                 {
4039                         z = buffer_z[x];
4040                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4041                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4042                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4043                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4044
4045                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4046                         {
4047                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4048                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4049                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4050                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4051                         }
4052                         else
4053                         {
4054                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4055                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4056                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4057                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4058                         }
4059                         buffer_FragColorbgra8[x*4+0] = d[0];
4060                         buffer_FragColorbgra8[x*4+1] = d[1];
4061                         buffer_FragColorbgra8[x*4+2] = d[2];
4062                         buffer_FragColorbgra8[x*4+3] = d[3];
4063                 }
4064         }
4065         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4066 }
4067
4068
4069
4070 static void DPSOFTRAST_VertexShader_LightSource(void)
4071 {
4072         int i;
4073         int numvertices = dpsoftrast.numvertices;
4074         float LightPosition[4];
4075         float LightVector[4];
4076         float LightVectorModelSpace[4];
4077         float EyePosition[4];
4078         float EyeVectorModelSpace[4];
4079         float EyeVector[4];
4080         float position[4];
4081         float svector[4];
4082         float tvector[4];
4083         float normal[4];
4084         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4085         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4086         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4087         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4088         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4089         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4090         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4091         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4092         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4093         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4094         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4095         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4096         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4097         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4098         for (i = 0;i < numvertices;i++)
4099         {
4100                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4101                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4102                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4103                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4104                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4105                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4106                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4107                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4108                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4109                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4110                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4111                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4112                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4113                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4114                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4115                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4116                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4117                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4118                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4119                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4120                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4121                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4122                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4123                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4124                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4125                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4126                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4127                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4128                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4131                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4132         }
4133         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4134         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4135 }
4136
4137 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4138 {
4139 #ifdef SSE_POSSIBLE
4140         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4141         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4142         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4143         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148         int x, startx = span->startx, endx = span->endx;
4149         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4150         float CubeVectordata[4];
4151         float CubeVectorslope[4];
4152         float LightVectordata[4];
4153         float LightVectorslope[4];
4154         float EyeVectordata[4];
4155         float EyeVectorslope[4];
4156         float z;
4157         float diffusetex[4];
4158         float glosstex[4];
4159         float surfacenormal[4];
4160         float lightnormal[4];
4161         float eyenormal[4];
4162         float specularnormal[4];
4163         float diffuse;
4164         float specular;
4165         float SpecularPower;
4166         float CubeVector[4];
4167         float attenuation;
4168         int d[4];
4169         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4170         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4171         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4172         Color_Glow[3] = 0.0f;
4173         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4174         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4175         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4176         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4177         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4178         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4179         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4180         Color_Diffuse[3] = 0.0f;
4181         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4182         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4183         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4184         Color_Specular[3] = 0.0f;
4185         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4186         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4187         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4188         Color_Pants[3] = 0.0f;
4189         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4190         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4191         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4192         Color_Shirt[3] = 0.0f;
4193         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4194         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4195         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4196         LightColor[3] = 0.0f;
4197         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4198         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4199         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4200         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4201         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4202         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4203         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4204         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4205         {
4206                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4207                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4208         }
4209         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4210                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4211         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4212         {
4213                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215                 for (x = startx;x < endx;x++)
4216                 {
4217                         z = buffer_z[x];
4218                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4219                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4220                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4221                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4222                         if (attenuation < 0.01f)
4223                                 continue;
4224                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4225                         {
4226                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4227                                 if (attenuation < 0.01f)
4228                                         continue;
4229                         }
4230
4231                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4232                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4233                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4234                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4235                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4236                         {
4237                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4238                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4239                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4240                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4241                         }
4242                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4243                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4244                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4245                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4246                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4247                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4248                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4249                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4250
4251                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4252                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4253                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4254                         DPSOFTRAST_Vector3Normalize(lightnormal);
4255
4256                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4257
4258                         if(thread->shader_exactspecularmath)
4259                         {
4260                                 // reflect lightnormal at surfacenormal, take the negative of that
4261                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4262                                 float f;
4263                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4264                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4265                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4266                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4267
4268                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4269                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4270                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4271                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4272                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4273
4274                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4275                         }
4276                         else
4277                         {
4278                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4279                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4280                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4281                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4282
4283                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4284                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4285                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4286                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4287
4288                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4289                         }
4290                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4291
4292                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4293                         {
4294                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4295                                 attenuation *= (1.0f / 255.0f);
4296                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4297                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4298                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4299                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4300                         }
4301                         else
4302                         {
4303                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4304                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4305                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4306                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4307                         }
4308                         buffer_FragColorbgra8[x*4+0] = d[0];
4309                         buffer_FragColorbgra8[x*4+1] = d[1];
4310                         buffer_FragColorbgra8[x*4+2] = d[2];
4311                         buffer_FragColorbgra8[x*4+3] = d[3];
4312                 }
4313         }
4314         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4315         {
4316                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4317                 for (x = startx;x < endx;x++)
4318                 {
4319                         z = buffer_z[x];
4320                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4321                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4322                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4323                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4324                         if (attenuation < 0.01f)
4325                                 continue;
4326                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4327                         {
4328                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4329                                 if (attenuation < 0.01f)
4330                                         continue;
4331                         }
4332
4333                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4334                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4335                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4336                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4337                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4338                         {
4339                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4340                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4341                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4342                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4343                         }
4344                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4345                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4346                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4347                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4348
4349                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4350                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4351                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4352                         DPSOFTRAST_Vector3Normalize(lightnormal);
4353
4354                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4355                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4356                         {
4357                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4358                                 attenuation *= (1.0f / 255.0f);
4359                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4360                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4361                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4362                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4363                         }
4364                         else
4365                         {
4366                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4367                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4368                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4369                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4370                         }
4371                         buffer_FragColorbgra8[x*4+0] = d[0];
4372                         buffer_FragColorbgra8[x*4+1] = d[1];
4373                         buffer_FragColorbgra8[x*4+2] = d[2];
4374                         buffer_FragColorbgra8[x*4+3] = d[3];
4375                 }
4376         }
4377         else
4378         {
4379                 for (x = startx;x < endx;x++)
4380                 {
4381                         z = buffer_z[x];
4382                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4383                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4384                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4385                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4386                         if (attenuation < 0.01f)
4387                                 continue;
4388                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4389                         {
4390                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4391                                 if (attenuation < 0.01f)
4392                                         continue;
4393                         }
4394
4395                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4396                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4397                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4398                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4399                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4400                         {
4401                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4402                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4403                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4404                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4405                         }
4406                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4407                         {
4408                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4409                                 attenuation *= (1.0f / 255.0f);
4410                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4411                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4412                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4413                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4414                         }
4415                         else
4416                         {
4417                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4418                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4419                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4420                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4421                         }
4422                         buffer_FragColorbgra8[x*4+0] = d[0];
4423                         buffer_FragColorbgra8[x*4+1] = d[1];
4424                         buffer_FragColorbgra8[x*4+2] = d[2];
4425                         buffer_FragColorbgra8[x*4+3] = d[3];
4426                 }
4427         }
4428         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4429 #endif
4430 }
4431
4432
4433
4434 static void DPSOFTRAST_VertexShader_Refraction(void)
4435 {
4436         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4437         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4438         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4439 }
4440
4441 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4442 {
4443         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4444         float z;
4445         int x, startx = span->startx, endx = span->endx;
4446
4447         // texture reads
4448         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4449         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4450
4451         // varyings
4452         float ModelViewProjectionPositiondata[4];
4453         float ModelViewProjectionPositionslope[4];
4454
4455         // uniforms
4456         float ScreenScaleRefractReflect[2];
4457         float ScreenCenterRefractReflect[2];
4458         float DistortScaleRefractReflect[2];
4459         float RefractColor[4];
4460
4461         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4462         if(!texture) return;
4463
4464         // read textures
4465         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4466         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4467
4468         // read varyings
4469         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4470
4471         // read uniforms
4472         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4473         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4474         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4475         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4476         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4477         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4478         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4479         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4480         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4481         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4482
4483         // do stuff
4484         for (x = startx;x < endx;x++)
4485         {
4486                 float SafeScreenTexCoord[2];
4487                 float ScreenTexCoord[2];
4488                 float v[3];
4489                 float iw;
4490                 unsigned char c[4];
4491
4492                 z = buffer_z[x];
4493
4494                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4495                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4496
4497                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4498                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4499                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4500
4501                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4502                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4503                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4504                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4505                 DPSOFTRAST_Vector3Normalize(v);
4506                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4507                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4508
4509                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4510                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4511
4512                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4513                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4514                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4515                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4516         }
4517
4518         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4519 }
4520
4521
4522
4523 static void DPSOFTRAST_VertexShader_Water(void)
4524 {
4525         int i;
4526         int numvertices = dpsoftrast.numvertices;
4527         float EyePosition[4];
4528         float EyeVectorModelSpace[4];
4529         float EyeVector[4];
4530         float position[4];
4531         float svector[4];
4532         float tvector[4];
4533         float normal[4];
4534         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4535         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4536         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4537         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4538         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4539         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4540         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4541         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4542         for (i = 0;i < numvertices;i++)
4543         {
4544                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4545                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4546                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4547                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4548                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4549                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4550                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4551                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4552                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4553                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4554                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4555                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4556                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4557                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4558                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4559                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4560                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4561                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4562                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4563                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4564                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4565                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4566         }
4567         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4568         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4569         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4570 }
4571
4572
4573 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4574 {
4575         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4576         float z;
4577         int x, startx = span->startx, endx = span->endx;
4578
4579         // texture reads
4580         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4581         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4582
4583         // varyings
4584         float ModelViewProjectionPositiondata[4];
4585         float ModelViewProjectionPositionslope[4];
4586         float EyeVectordata[4];
4587         float EyeVectorslope[4];
4588
4589         // uniforms
4590         float ScreenScaleRefractReflect[4];
4591         float ScreenCenterRefractReflect[4];
4592         float DistortScaleRefractReflect[4];
4593         float RefractColor[4];
4594         float ReflectColor[4];
4595         float ReflectFactor;
4596         float ReflectOffset;
4597
4598         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4599         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4600         if(!texture_refraction || !texture_reflection) return;
4601
4602         // read textures
4603         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4604         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4605
4606         // read varyings
4607         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4608         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4609
4610         // read uniforms
4611         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4612         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4613         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4614         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4615         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4616         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4617         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4618         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4619         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4620         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4621         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4622         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4623         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4624         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4625         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4626         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4627         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4628         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4629         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4630         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4631         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4632         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4633
4634         // do stuff
4635         for (x = startx;x < endx;x++)
4636         {
4637                 float SafeScreenTexCoord[4];
4638                 float ScreenTexCoord[4];
4639                 float v[3];
4640                 float iw;
4641                 unsigned char c1[4];
4642                 unsigned char c2[4];
4643                 float Fresnel;
4644
4645                 z = buffer_z[x];
4646
4647                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4648                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4649
4650                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4651                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4652                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4653                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4654                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4655
4656                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4657                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4658                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4659                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4660                 DPSOFTRAST_Vector3Normalize(v);
4661                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4662                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4663                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4664                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4665
4666                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4667                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4668                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4669                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4670                 DPSOFTRAST_Vector3Normalize(v);
4671                 Fresnel = 1.0f - v[2];
4672                 Fresnel = min(1.0f, Fresnel);
4673                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4674
4675                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4676                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4677                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4678                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4679
4680                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4681                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4682                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4683                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4684         }
4685
4686         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4687 }
4688
4689
4690
4691 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4692 { // vertex stage of the ShowDepth shader mode: only the position array is processed
4693         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // the position array is passed for both array arguments; the third argument points at the ModelViewProjectionMatrixM1 slot of uniform4f (slots are 4 floats apart)
4694 }
4695
4696 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4697 { // span stage of the ShowDepth shader mode; placeholder that outputs zeroed BGRA8 pixels
4698         // TODO: IMPLEMENT - until then every pixel in [span->startx, span->endx) is emitted as four zero bytes
4699         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // handed to DPSOFTRAST_Draw_Span_Begin; not read again in this function
4700         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel, addressed as x*4
4701         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4702         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4); // clear only the bytes that belong to this span
4703         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8); // pass the zeroed colors on to the span finisher
4704 }
4705
4706
4707
4708 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4709 { // vertex stage of the DeferredGeometry shader mode: only the position array is processed
4710         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // the position array is passed for both array arguments; the third argument points at the ModelViewProjectionMatrixM1 slot of uniform4f (slots are 4 floats apart)
4711 }
4712
4713 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4714 { // span stage of the DeferredGeometry shader mode; placeholder that outputs zeroed BGRA8 pixels
4715         // TODO: IMPLEMENT - until then every pixel in [span->startx, span->endx) is emitted as four zero bytes
4716         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // handed to DPSOFTRAST_Draw_Span_Begin; not read again in this function
4717         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel, addressed as x*4
4718         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4719         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4); // clear only the bytes that belong to this span
4720         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8); // pass the zeroed colors on to the span finisher
4721 }
4722
4723
4724
4725 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4726 { // vertex stage of the DeferredLightSource shader mode: only the position array is processed
4727         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // the position array is passed for both array arguments; the third argument points at the ModelViewProjectionMatrixM1 slot of uniform4f (slots are 4 floats apart)
4728 }
4729
4730 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4731 { // span stage of the DeferredLightSource shader mode; placeholder that outputs zeroed BGRA8 pixels
4732         // TODO: IMPLEMENT - until then every pixel in [span->startx, span->endx) is emitted as four zero bytes
4733         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // handed to DPSOFTRAST_Draw_Span_Begin; not read again in this function
4734         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel, addressed as x*4
4735         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4736         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4); // clear only the bytes that belong to this span
4737         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8); // pass the zeroed colors on to the span finisher
4738 }
4739
4740
4741
4742 typedef struct DPSOFTRAST_ShaderModeInfo_s // dispatch record for one shader mode; DPSOFTRAST_ShaderModeTable holds one per mode
4743 {
4744         int lodarrayindex; // array index whose per-triangle attribute deltas feed the mip edge scale in DPSOFTRAST_Interpret_Draw
4745         void (*Vertex)(void); // vertex stage callback (the table fills this with DPSOFTRAST_VertexShader_* functions)
4746         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // per-span pixel stage, invoked from DPSOFTRAST_Draw_ProcessSpans
4747         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // array indices set up per triangle; the list ends at the first value >= DPSOFTRAST_ARRAY_TOTAL
4748         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // texture units considered for mip selection; the list ends at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4749 }
4750 DPSOFTRAST_ShaderModeInfo;
4751
4752 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] = // looked up with thread->shader_mode; NOTE(review): rows presumably follow the SHADERMODE_* enum order - confirm against the enum
4753 { // row layout: {lodarrayindex, Vertex, Span, {arrays..., ~0}, {texunits..., ~0}}; ~0 is stored as an all-ones unsigned char, which the consuming loops treat as the out-of-range end marker
4754         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4755         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4756         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4757         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4758         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4759         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4760         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4761         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4762         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4763         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4764         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4765         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4766         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4767         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4768         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4769         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}, {~0}},
4770         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}, {~0}},
4771         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}, {~0}},
4772 }; // the last three modes list no arrays and no texture units; their texunits list is terminated explicitly because an omitted initializer would be zero-filled and read as "unit 0" in every slot
4773
4774 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4775 { // fills thread->pixelmaskarray for this span (nonzero = pixel survives), trims rejected pixels off both ends, then stores the mask pointer and trimmed range back into the span
4776         int x;
4777         int startx;
4778         int endx;
4779         unsigned int *depthpixel;
4780         int depth;
4781         int depthslope;
4782         unsigned int d;
4783         unsigned char *pixelmask;
4786         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x; // only dereferenced below, after fb_depthpixels has been checked
4787         startx = span->startx;
4788         endx = span->endx;
4789         depth = span->depthbase;
4790         depthslope = span->depthslope;
4791         pixelmask = thread->pixelmaskarray;
4792         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4793         {
4794                 switch(thread->fb_depthfunc) // every case walks x over [startx, endx) with d = depth + depthslope*x and compares the stored depthpixel[x] against d
4795                 {
4796                 default: // any other depth function is handled like GL_ALWAYS
4797                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4798                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4799                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4800                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4801                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4802                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4803                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4804                 }
4805                 while (startx < endx && !pixelmask[startx]) // drop rejected pixels at the left end
4806                         startx++;
4807                 while (endx > startx && !pixelmask[endx-1]) // drop rejected pixels at the right end
4808                         endx--;
4809         }
4810         else
4811         {
4812                 // no depth testing means we're just dealing with color...
4813                 memset(pixelmask + startx, 1, endx - startx); // so every pixel of the span passes
4814         }
4815         span->pixelmask = pixelmask;
4816         span->startx = startx; // may equal endx when every pixel was rejected; the caller checks for that
4817         span->endx = endx;
4818 }
4819
4820 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4821 { // writes the span's interpolated depth into the depth buffer for every pixel whose span->pixelmask entry is still set
4822         int x, d, depth, depthslope, startx, endx; // d is a signed int here (DPSOFTRAST_Draw_DepthTest steps an unsigned int); it is converted on the store below
4823         const unsigned char *pixelmask;
4824         unsigned int *depthpixel;
4825         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels) // nothing is written unless depth writes and depth testing are enabled and a depth buffer exists
4826         {
4827                 depth = span->depthbase;
4828                 depthslope = span->depthslope;
4829                 pixelmask = span->pixelmask;
4830                 startx = span->startx;
4831                 endx = span->endx;
4832                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4833                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) // same linear depth stepping as the depth test
4834                         if (pixelmask[x])
4835                                 depthpixel[x] = d;
4836         }
4837 }
4838
4839 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4840 { // for each queued span of this thread: depth test, pixel shader, depth write; afterwards the span queue is emptied
4841         int i;
4842         DPSOFTRAST_State_Triangle *triangle;
4843         DPSOFTRAST_State_Span *span;
4844         for (i = 0; i < thread->numspans; i++)
4845         {
4846                 span = &thread->spans[i];
4847                 triangle = &thread->triangles[span->triangle]; // span->triangle indexes this thread's triangle array
4848                 DPSOFTRAST_Draw_DepthTest(thread, span); // sets span->pixelmask and may shrink span->startx/endx
4849                 if (span->startx >= span->endx)
4850                         continue; // no pixel of this span survived the depth test
4851                 // run pixel shader if appropriate
4852                 // do this before running depthmask code, to allow the pixelshader
4853                 // to clear pixelmask values for alpha testing
4854                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask) // shade only when color buffer 0 exists and fb_colormask is nonzero
4855                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4856                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4857         }
4858         thread->numspans = 0; // all queued spans have been consumed
4859 }
4860
4861 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;) // NOTE(review): DEFCOMMAND is defined outside this chunk; presumably this declares DPSOFTRAST_Command_Draw (the command type DPSOFTRAST_Interpret_Draw receives) with these fields - confirm against the macro
4862
4863 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4864 {
4865 #ifdef SSE_POSSIBLE
4866         int cullface = thread->cullface;
4867         int minx, maxx, miny, maxy;
4868         int miny1, maxy1, miny2, maxy2;
4869         __m128i fbmin, fbmax;
4870         __m128 viewportcenter, viewportscale;
4871         int firstvertex = command->firstvertex;
4872         int numvertices = command->numvertices;
4873         int numtriangles = command->numtriangles;
4874         const int *element3i = command->element3i;
4875         const unsigned short *element3s = command->element3s;
4876         int clipped = command->clipped;
4877         int i;
4878         int j;
4879         int k;
4880         int y;
4881         int e[3];
4882         __m128i screeny;
4883         int starty, endy, bandy;
4884         int numpoints;
4885         int clipcase;
4886         float clipdist[4];
4887         float clip0origin, clip0slope;
4888         int clip0dir;
4889         __m128 triangleedge1, triangleedge2, trianglenormal;
4890         __m128 clipfrac[3];
4891         __m128 screen[4];
4892         DPSOFTRAST_State_Triangle *triangle;
4893         DPSOFTRAST_Texture *texture;
4894         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4895         miny = thread->fb_scissor[1];
4896         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4897         miny1 = bound(miny, thread->miny1, maxy);
4898         maxy1 = bound(miny, thread->maxy1, maxy);
4899         miny2 = bound(miny, thread->miny2, maxy);
4900         maxy2 = bound(miny, thread->maxy2, maxy);
4901         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4902         {
4903                 if (!ATOMIC_DECREMENT(command->refcount))
4904                 {
4905                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4906                                 MM_FREE(command->arrays);
4907                 }
4908                 return;
4909         }
4910         minx = thread->fb_scissor[0];
4911         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4912         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4913         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4914         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4915         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4916         screen[3] = _mm_setzero_ps();
4917         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4918         for (i = 0;i < numtriangles;i++)
4919         {
4920                 const float *screencoord4f = command->arrays;
4921                 const float *arrays = screencoord4f + numvertices*4;
4922
4923                 // generate the 3 edges of this triangle
4924                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4925                 if (element3s)
4926                 {
4927                         e[0] = element3s[i*3+0] - firstvertex;
4928                         e[1] = element3s[i*3+1] - firstvertex;
4929                         e[2] = element3s[i*3+2] - firstvertex;
4930                 }
4931                 else if (element3i)
4932                 {
4933                         e[0] = element3i[i*3+0] - firstvertex;
4934                         e[1] = element3i[i*3+1] - firstvertex;
4935                         e[2] = element3i[i*3+2] - firstvertex;
4936                 }
4937                 else
4938                 {
4939                         e[0] = i*3+0;
4940                         e[1] = i*3+1;
4941                         e[2] = i*3+2;
4942                 }
4943
4944 #define SKIPBACKFACE \
4945                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4946                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4947                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4948                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4949                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4950                 switch(cullface) \
4951                 { \
4952                 case GL_BACK: \
4953                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4954                                 continue; \
4955                         break; \
4956                 case GL_FRONT: \
4957                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4958                                 continue; \
4959                         break; \
4960                 }
4961
4962 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4963                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4964                         { \
4965                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4966                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4967                         }
4968 #define CLIPPEDVERTEXCOPY(k,p1) \
4969                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4970
4971 #define GENATTRIBCOPY(attrib, p1) \
4972                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4973 #define GENATTRIBLERP(attrib, p1, p2) \
4974                 { \
4975                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4976                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4977                 }
4978 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4979                 switch(clipcase) \
4980                 { \
4981                 default: \
4982                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4983                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4984                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4985                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4986                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4987                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4988                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4989                 }
4990
4991                 if (! clipped)
4992                         goto notclipped;
4993
4994                 // calculate distance from nearplane
4995                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4996                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4997                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4998                 if (clipdist[0] >= 0.0f)
4999                 {
5000                         if (clipdist[1] >= 0.0f)
5001                         {
5002                                 if (clipdist[2] >= 0.0f)
5003                                 {
5004                                 notclipped:
5005                                         // triangle is entirely in front of nearplane
5006                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5007                                         SKIPBACKFACE;
5008                                         numpoints = 3;
5009                                         clipcase = 0;
5010                                 }
5011                                 else
5012                                 {
5013                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5014                                         SKIPBACKFACE;
5015                                         numpoints = 4;
5016                                         clipcase = 1;
5017                                 }
5018                         }
5019                         else
5020                         {
5021                                 if (clipdist[2] >= 0.0f)
5022                                 {
5023                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5024                                         SKIPBACKFACE;
5025                                         numpoints = 4;
5026                                         clipcase = 2;
5027                                 }
5028                                 else
5029                                 {
5030                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5031                                         SKIPBACKFACE;
5032                                         numpoints = 3;
5033                                         clipcase = 3;
5034                                 }
5035                         }
5036                 }
5037                 else if (clipdist[1] >= 0.0f)
5038                 {
5039                         if (clipdist[2] >= 0.0f)
5040                         {
5041                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5042                                 SKIPBACKFACE;
5043                                 numpoints = 4;
5044                                 clipcase = 4;
5045                         }
5046                         else
5047                         {
5048                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5049                                 SKIPBACKFACE;
5050                                 numpoints = 3;
5051                                 clipcase = 5;
5052                         }
5053                 }
5054                 else if (clipdist[2] >= 0.0f)
5055                 {
5056                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5057                         SKIPBACKFACE;
5058                         numpoints = 3;
5059                         clipcase = 6;
5060                 }
5061                 else continue; // triangle is entirely behind nearplane
5062
5063                 {
5064                         // calculate integer y coords for triangle points
5065                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5066                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5067                                         screenmin = _mm_min_epi16(screeni, screenir),
5068                                         screenmax = _mm_max_epi16(screeni, screenir);
5069                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5070                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5071                         screenmin = _mm_max_epi16(screenmin, fbmin);
5072                         screenmax = _mm_min_epi16(screenmax, fbmax);
5073                         // skip offscreen triangles
5074                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5075                                 continue;
5076                         starty = _mm_extract_epi16(screenmin, 1);
5077                         endy = _mm_extract_epi16(screenmax, 1)+1;
5078                         if (starty >= maxy1 && endy <= miny2)
5079                                 continue;
5080                         screeny = _mm_srai_epi32(screeni, 16);
5081                 }
5082
5083                 triangle = &thread->triangles[thread->numtriangles];
5084
5085                 // calculate attribute plans for triangle data...
5086                 // okay, this triangle is going to produce spans, we'd better project
5087                 // the interpolants now (this is what gives perspective texturing),
5088                 // this consists of simply multiplying all arrays by the W coord
5089                 // (which is basically 1/Z), which will be undone per-pixel
5090                 // (multiplying by Z again) to get the perspective-correct array
5091                 // values
5092                 {
5093                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5094                         __m128 mipedgescale, mipdensity;
5095                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5096                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5097                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5098                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5099                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5100                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5101                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5102                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5103                         attribedge1 = _mm_sub_ss(w0, w1);
5104                         attribedge2 = _mm_sub_ss(w2, w1);
5105                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5106                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5107                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5108                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5109                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5110                         _mm_store_ss(&triangle->w[0], attribxslope);
5111                         _mm_store_ss(&triangle->w[1], attribyslope);
5112                         _mm_store_ss(&triangle->w[2], attriborigin);
5113                         
5114                         clip0origin = 0;
5115                         clip0slope = 0;
5116                         clip0dir = 0;
5117                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5118                         {
5119                                 float cliporigin, clipxslope, clipyslope;
5120                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5121                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5122                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5123                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5124                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5125                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5126                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5127                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5128                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5129                                 if(clipxslope != 0)
5130                                 {
5131                                         clip0origin = -cliporigin/clipxslope;
5132                                         clip0slope = -clipyslope/clipxslope;
5133                                         clip0dir = clipxslope > 0 ? 1 : -1;
5134                                 }
5135                                 else if(clipyslope > 0)
5136                                 {
5137                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5138                                         clip0slope = dpsoftrast.fb_width;
5139                                         clip0dir = -1;
5140                                 }
5141                                 else if(clipyslope < 0)
5142                                 {
5143                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5144                                         clip0slope = -dpsoftrast.fb_width;
5145                                         clip0dir = -1;
5146                                 }
5147                                 else if(clip0origin < 0) continue;
5148                         }
5149
5150                         mipedgescale = _mm_setzero_ps();
5151                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5152                         {
5153                                 __m128 attrib0, attrib1, attrib2;
5154                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5155                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5156                                         break;
5157                                 arrays += numvertices*4;
5158                                 GENATTRIBS(attrib0, attrib1, attrib2);
5159                                 attriborigin = _mm_mul_ps(attrib1, w1);
5160                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5161                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5162                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5163                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5164                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5165                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5166                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5167                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5168                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5169                                 {
5170                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5171                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5172                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5173                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5174                                 }
5175                         }
5176
5177                         memset(triangle->mip, 0, sizeof(triangle->mip));
5178                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5179                         {
5180                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5181                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5182                                         break;
5183                                 texture = thread->texbound[texunit];
5184                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5185                                 {
5186                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5187                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5188                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5189                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5190                                         // this will be multiplied in the texturing routine by the texture resolution
5191                                         y = _mm_cvtss_si32(mipdensity);
5192                                         if (y > 0)
5193                                         {
5194                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5195                                                 if (y > texture->mipmaps - 1)
5196                                                         y = texture->mipmaps - 1;
5197                                                 triangle->mip[texunit] = y;
5198                                         }
5199                                 }
5200                         }
5201                 }
5202         
5203                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5204                 for (; y < bandy;)
5205                 {
5206                         __m128 xcoords, xslope;
5207                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5208                         int yccmask = _mm_movemask_epi8(ycc);
5209                         int edge0p, edge0n, edge1p, edge1n;
5210                         int nexty;
5211                         float w, wslope;
5212                         float clip0;
5213                         if (numpoints == 4)
5214                         {
5215                                 switch(yccmask)
5216                                 {
5217                                 default:
5218                                 case 0xFFFF: /*0000*/ y = endy; continue;
5219                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5220                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5221                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5222                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5223                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5224                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5225                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5226                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5227                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5228                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5229                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5230                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5231                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5232                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5233                                 case 0x0000: /*1111*/ y++; continue;
5234                                 }
5235                         }
5236                         else
5237                         {
5238                                 switch(yccmask)
5239                                 {
5240                                 default:
5241                                 case 0xFFFF: /*000*/ y = endy; continue;
5242                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5243                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5244                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5245                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5246                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5247                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5248                                 case 0x0000: /*111*/ y++; continue;
5249                                 }
5250                         }
5251                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5252                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5253                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5254                         nexty = _mm_extract_epi16(ycc, 0);
5255                         if (nexty >= bandy) nexty = bandy-1;
5256                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5257                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5258                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5259                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5260                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5261                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5262                         {
5263                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5264                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5265                         }
5266                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5267                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5268                         {
5269                                 int startx, endx, offset;
5270                                 startx = _mm_cvtss_si32(xcoords);
5271                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5272                                 if (startx < minx) startx = minx;
5273                                 if (endx > maxx) endx = maxx;
5274                                 if (startx >= endx) continue;
5275
5276                                 if (clip0dir)
5277                                 {
5278                                         if (clip0dir > 0)
5279                                         {
5280                                                 if (startx < clip0) 
5281                                                 {
5282                                                         if(endx <= clip0) continue;
5283                                                         startx = (int)clip0;
5284                                                 }
5285                                         }
5286                                         else if (endx > clip0) 
5287                                         {
5288                                                 if(startx >= clip0) continue;
5289                                                 endx = (int)clip0;
5290                                         }
5291                                 }
5292                                                 
5293                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5294                                 {
5295                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5296                                         span->triangle = thread->numtriangles;
5297                                         span->x = offset;
5298                                         span->y = y;
5299                                         span->startx = 0;
5300                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5301                                         if (span->startx >= span->endx)
5302                                                 continue;
5303                                         wslope = triangle->w[0];
5304                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5305                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5306                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5307                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5308                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5309                                 }
5310                         }
5311                 }
5312
5313                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5314                 {
5315                         DPSOFTRAST_Draw_ProcessSpans(thread);
5316                         thread->numtriangles = 0;
5317                 }
5318         }
5319
5320         if (!ATOMIC_DECREMENT(command->refcount))
5321         {
5322                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5323                         MM_FREE(command->arrays);
5324         }
5325
5326         if (thread->numspans > 0 || thread->numtriangles > 0)
5327         {
5328                 DPSOFTRAST_Draw_ProcessSpans(thread);
5329                 thread->numtriangles = 0;
5330         }
5331 #endif
5332 }
5333
5334 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5335 {
5336         int i;
5337         int j;
5338         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5339         int datasize = 2*numvertices*sizeof(float[4]);
5340         DPSOFTRAST_Command_Draw *command;
5341         unsigned char *data;
5342         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5343         {
5344                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5345                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5346                         break;
5347                 datasize += numvertices*sizeof(float[4]);
5348         }
5349         if (element3s)
5350                 datasize += numtriangles*sizeof(unsigned short[3]);
5351         else if (element3i)
5352                 datasize += numtriangles*sizeof(int[3]);
5353         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5354         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5355         {
5356                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5357                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5358         }
5359         else
5360         {
5361                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5362                 data = (unsigned char *)command + commandsize;
5363         }
5364         command->firstvertex = firstvertex;
5365         command->numvertices = numvertices;
5366         command->numtriangles = numtriangles;
5367         command->arrays = (float *)data;
5368         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5369         dpsoftrast.firstvertex = firstvertex;
5370         dpsoftrast.numvertices = numvertices;
5371         dpsoftrast.screencoord4f = (float *)data;
5372         data += numvertices*sizeof(float[4]);
5373         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5374         data += numvertices*sizeof(float[4]);
5375         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5376         {
5377                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5378                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5379                         break;
5380                 dpsoftrast.post_array4f[j] = (float *)data;
5381                 data += numvertices*sizeof(float[4]);
5382         }
5383         command->element3i = NULL;
5384         command->element3s = NULL;
5385         if (element3s)
5386         {
5387                 command->element3s = (unsigned short *)data;
5388                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5389         }
5390         else if (element3i)
5391         {
5392                 command->element3i = (int *)data;
5393                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5394         }
5395         return command;
5396 }
5397
5398 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5399 {
5400         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5401         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5402         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5403         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5404         if (command->starty >= command->endy)
5405         {
5406                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5407                         MM_FREE(command->arrays);
5408                 DPSOFTRAST_UndoCommand(command->commandsize);
5409                 return;
5410         }
5411         command->clipped = dpsoftrast.drawclipped;
5412         command->refcount = dpsoftrast.numthreads;
5413
5414         if (dpsoftrast.usethreads)
5415         {
5416                 int i;
5417                 DPSOFTRAST_Draw_SyncCommands();
5418                 for (i = 0; i < dpsoftrast.numthreads; i++)
5419                 {
5420                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5421                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5422                                 Thread_CondSignal(thread->drawcond);
5423                 }
5424         }
5425         else
5426         {
5427                 DPSOFTRAST_Draw_FlushThreads();
5428         }
5429 }
5430
5431 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5432 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5433 {
5434         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5435 }
5436 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5437 {
5438         DPSOFTRAST_Command_SetRenderTargets *command;
5439         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5440                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5441                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5442                 DPSOFTRAST_Flush();
5443         dpsoftrast.fb_width = width;
5444         dpsoftrast.fb_height = height;
5445         dpsoftrast.fb_depthpixels = depthpixels;
5446         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5447         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5448         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5449         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5450         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5451         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5452         command->width = width;
5453         command->height = height;
5454 }
5455  
5456 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5457 {
5458         int commandoffset = thread->commandoffset;
5459         while (commandoffset != endoffset)
5460         {
5461                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5462                 switch (command->opcode)
5463                 {
5464 #define INTERPCOMMAND(name) \
5465                 case DPSOFTRAST_OPCODE_##name : \
5466                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5467                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5468                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5469                                 commandoffset = 0; \
5470                         break;
5471                 INTERPCOMMAND(Viewport)
5472                 INTERPCOMMAND(ClearColor)
5473                 INTERPCOMMAND(ClearDepth)
5474                 INTERPCOMMAND(ColorMask)
5475                 INTERPCOMMAND(DepthTest)
5476                 INTERPCOMMAND(ScissorTest)
5477                 INTERPCOMMAND(Scissor)
5478                 INTERPCOMMAND(BlendFunc)
5479                 INTERPCOMMAND(BlendSubtract)
5480                 INTERPCOMMAND(DepthMask)
5481                 INTERPCOMMAND(DepthFunc)
5482                 INTERPCOMMAND(DepthRange)
5483                 INTERPCOMMAND(PolygonOffset)
5484                 INTERPCOMMAND(CullFace)
5485                 INTERPCOMMAND(SetTexture)
5486                 INTERPCOMMAND(SetShader)
5487                 INTERPCOMMAND(Uniform4f)
5488                 INTERPCOMMAND(UniformMatrix4f)
5489                 INTERPCOMMAND(Uniform1i)
5490                 INTERPCOMMAND(SetRenderTargets)
5491                 INTERPCOMMAND(ClipPlane)
5492
5493                 case DPSOFTRAST_OPCODE_Draw:
5494                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5495                         commandoffset += command->commandsize;
5496                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5497                                 commandoffset = 0;
5498                         thread->commandoffset = commandoffset;
5499                         break;
5500
5501                 case DPSOFTRAST_OPCODE_Reset:
5502                         commandoffset = 0;
5503                         break;
5504                 }
5505         }
5506         thread->commandoffset = commandoffset;
5507 }
5508
5509 static int DPSOFTRAST_Draw_Thread(void *data)
5510 {
5511         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5512         while(thread->index >= 0)
5513         {
5514                 if (thread->commandoffset != dpsoftrast.drawcommand)
5515                 {
5516                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5517                 }
5518                 else 
5519                 {
5520                         Thread_LockMutex(thread->drawmutex);
5521                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5522                         {
5523                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5524                                 thread->starving = true;
5525                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5526                                 thread->starving = false;
5527                         }
5528                         Thread_UnlockMutex(thread->drawmutex);
5529                 }
5530         }   
5531         return 0;
5532 }
5533
5534 static void DPSOFTRAST_Draw_FlushThreads(void)
5535 {
5536         DPSOFTRAST_State_Thread *thread;
5537         int i;
5538         DPSOFTRAST_Draw_SyncCommands();
5539         if (dpsoftrast.usethreads) 
5540         {
5541                 for (i = 0; i < dpsoftrast.numthreads; i++)
5542                 {
5543                         thread = &dpsoftrast.threads[i];
5544                         if (thread->commandoffset != dpsoftrast.drawcommand)
5545                         {
5546                                 Thread_LockMutex(thread->drawmutex);
5547                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5548                                         Thread_CondSignal(thread->drawcond);
5549                                 Thread_UnlockMutex(thread->drawmutex);
5550                         }
5551                 }
5552                 for (i = 0; i < dpsoftrast.numthreads; i++)
5553                 {
5554                         thread = &dpsoftrast.threads[i];
5555                         if (thread->commandoffset != dpsoftrast.drawcommand)
5556                         {
5557                                 Thread_LockMutex(thread->drawmutex);
5558                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5559                                 {
5560                                         thread->waiting = true;
5561                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5562                                         thread->waiting = false;
5563                                 }
5564                                 Thread_UnlockMutex(thread->drawmutex);
5565                         }
5566                 }
5567         }
5568         else
5569         {
5570                 for (i = 0; i < dpsoftrast.numthreads; i++)
5571                 {
5572                         thread = &dpsoftrast.threads[i];
5573                         if (thread->commandoffset != dpsoftrast.drawcommand)
5574                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5575                 }
5576         }
5577         dpsoftrast.commandpool.usedcommands = 0;
5578 }
5579
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which
// executes all queued commands before returning.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
	return;
}
5584
// Public finish entry point; currently identical to DPSOFTRAST_Flush().
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
	return;
}
5589
5590 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5591 {
5592         int i;
5593         union
5594         {
5595                 int i;
5596                 unsigned char b[4];
5597         }
5598         u;
5599         u.i = 1;
5600         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5601         dpsoftrast.bigendian = u.b[3];
5602         dpsoftrast.fb_width = width;
5603         dpsoftrast.fb_height = height;
5604         dpsoftrast.fb_depthpixels = depthpixels;
5605         dpsoftrast.fb_colorpixels[0] = colorpixels;
5606         dpsoftrast.fb_colorpixels[1] = NULL;
5607         dpsoftrast.fb_colorpixels[1] = NULL;
5608         dpsoftrast.fb_colorpixels[1] = NULL;
5609         dpsoftrast.viewport[0] = 0;
5610         dpsoftrast.viewport[1] = 0;
5611         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5612         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5613         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5614         dpsoftrast.texture_firstfree = 1;
5615         dpsoftrast.texture_end = 1;
5616         dpsoftrast.texture_max = 0;
5617         dpsoftrast.color[0] = 1;
5618         dpsoftrast.color[1] = 1;
5619         dpsoftrast.color[2] = 1;
5620         dpsoftrast.color[3] = 1;
5621         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5622         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5623         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5624         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5625         for (i = 0; i < dpsoftrast.numthreads; i++)
5626         {
5627                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5628                 thread->index = i;
5629                 thread->cullface = GL_BACK;
5630         thread->colormask[0] = 1; 
5631                 thread->colormask[1] = 1;
5632                 thread->colormask[2] = 1;
5633                 thread->colormask[3] = 1;
5634                 thread->blendfunc[0] = GL_ONE;
5635                 thread->blendfunc[1] = GL_ZERO;
5636                 thread->depthmask = true;
5637                 thread->depthtest = true;
5638                 thread->depthfunc = GL_LEQUAL;
5639                 thread->scissortest = false;
5640                 thread->viewport[0] = 0;
5641                 thread->viewport[1] = 0;
5642                 thread->viewport[2] = dpsoftrast.fb_width;
5643                 thread->viewport[3] = dpsoftrast.fb_height;
5644                 thread->scissor[0] = 0;
5645                 thread->scissor[1] = 0;
5646                 thread->scissor[2] = dpsoftrast.fb_width;
5647                 thread->scissor[3] = dpsoftrast.fb_height;
5648                 thread->depthrange[0] = 0;
5649                 thread->depthrange[1] = 1;
5650                 thread->polygonoffset[0] = 0;
5651                 thread->polygonoffset[1] = 0;
5652                 thread->clipplane[0] = 0;
5653                 thread->clipplane[1] = 0;
5654                 thread->clipplane[2] = 0;
5655                 thread->clipplane[3] = 1;
5656         
5657                 thread->numspans = 0;
5658                 thread->numtriangles = 0;
5659                 thread->commandoffset = 0;
5660                 thread->waiting = false;
5661                 thread->starving = false;
5662            
5663                 thread->validate = -1;
5664                 DPSOFTRAST_Validate(thread, -1);
5665  
5666                 if (dpsoftrast.usethreads)
5667                 {
5668                         thread->waitcond = Thread_CreateCond();
5669                         thread->drawcond = Thread_CreateCond();
5670                         thread->drawmutex = Thread_CreateMutex();
5671                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5672                 }
5673         }
5674         return 0;
5675 }
5676
5677 void DPSOFTRAST_Shutdown(void)
5678 {
5679         int i;
5680         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5681         {
5682                 DPSOFTRAST_State_Thread *thread;
5683                 for (i = 0; i < dpsoftrast.numthreads; i++)
5684                 {
5685                         thread = &dpsoftrast.threads[i];
5686                         Thread_LockMutex(thread->drawmutex);
5687                         thread->index = -1;
5688                         Thread_CondSignal(thread->drawcond);
5689                         Thread_UnlockMutex(thread->drawmutex);
5690                         Thread_WaitThread(thread->thread, 0);
5691                         Thread_DestroyCond(thread->waitcond);
5692                         Thread_DestroyCond(thread->drawcond);
5693                         Thread_DestroyMutex(thread->drawmutex);
5694                 }
5695         }
5696         for (i = 0;i < dpsoftrast.texture_end;i++)
5697                 if (dpsoftrast.texture[i].bytes)
5698                         MM_FREE(dpsoftrast.texture[i].bytes);
5699         if (dpsoftrast.texture)
5700                 free(dpsoftrast.texture);
5701         if (dpsoftrast.threads)
5702                 MM_FREE(dpsoftrast.threads);
5703         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5704 }
5705