]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
ALIGNED_SIZE -> ALIGN_SIZE
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

#ifdef SSE_POSSIBLE
	// alignment decorations and the store fence, selected by compiler family
	#if defined(__APPLE__) || defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		// disabled alternative kept from the original: (__sync_synchronize())
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(4)) var
		#define MEMORY_BARRIER (_mm_sfence())
		// disabled alternative kept from the original: (MemoryBarrier())
	#endif

	// atomic counter type and operations, selected by platform
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ATOMIC_COUNTER volatile int32_t 
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ATOMIC_COUNTER volatile LONG
		// this LONG * cast serves to fix an issue with broken mingw
		// packages on Ubuntu; these only declare the function to take
		// a LONG *, causing a compile error here. This seems to be
		// error- and warn-free on platforms that DO declare
		// InterlockedIncrement correctly, like mingw on Windows.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// plain (non-atomic, unaligned) fallbacks for anything not defined above
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Workaround for GCC releases older than 4.6 (not clang): provide
// _mm_cvtss_f32 through the builtin. The version test compares major and
// minor together so that e.g. GCC 5.0 is not mistaken for an old compiler.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

// allocations made through MM_MALLOC/MM_CALLOC are aligned to ALIGN_SIZE
// and must be released with MM_FREE
#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// Aligned equivalent of calloc: returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose total size would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// without SSE the C library allocator is used directly
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// indices of the vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size per-array tables
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// commands use the same alignment as everything else marked with ALIGN
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header: every struct generated by DEFCOMMAND starts with these
// two fields in this order
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // a DPSOFTRAST_OPCODE_* value
	unsigned short commandsize;
}
DPSOFTRAST_Command);
151
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares, for one command:
//  - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//  - the aligned struct type DPSOFTRAST_Command_<name>, consisting of the
//    opcode/commandsize header followed by 'fields'
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// DPSOFTRAST_CALCATTRIB (SSE): for attribute array 'arrayindex' of 'triangle',
// stores attribs[arrayindex][0] in 'slope' and stores in 'data'
//   attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1]
// 'slope' and 'data' are __m128 lvalues; the attribs rows are read with aligned loads.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// DPSOFTRAST_CALCATTRIB4F: scalar version of the above; 'slope' and 'data'
// are arrays (or pointers) of at least 4 floats. The span, slope and data
// arguments are parenthesized so that expression arguments expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels produced for a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer coordinates of the span
	int x;
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
	// depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthbase;
	// depthbuffer value pixel delta
	int depthslope;
}
DPSOFTRAST_State_Span);
213
// capacities of the per-thread span/triangle buffers and the pixel mask
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of the per-thread 'validate' field: each one marks a group of derived
// values that has to be recomputed before it is used
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// everything that must be valid before drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// blend modes that the GL blend function pairs are reduced to;
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD = 9,
	DPSOFTRAST_BLENDMODE_TOTAL = 10
}
DPSOFTRAST_BLENDMODE;
238
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index;
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2];
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int viewport[4];
253         int scissor[4];
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]);
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4];
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode;
282
283         // band boundaries
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
342         int texture_max;
343         int texture_end;
344         int texture_firstfree;
345         DPSOFTRAST_Texture *texture;
346
347         int bigendian;
348
349         // error reporting
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace;
354         int numthreads;
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
363 DPSOFTRAST_State dpsoftrast;
364
// 2^30: factor applied when converting a float depth to an integer depth
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one int laid out as a<<24 | r<<16 | g<<8 | b,
// each channel computed as (int)(value * 255 + 0.5).
// Arguments are parenthesized so expressions may be passed, and the shifts
// are done on unsigned values because shifting a set bit into the sign bit
// of an int is undefined; the result is converted back to int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to an integer depth: DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate; always evaluates to 0. The arguments are
// parenthesized so that expression arguments expand correctly; note that
// 'thread' and 'f' are each evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         memcpy(dst, pixels, blockwidth * 4);
755                         pixels += blockwidth * 4;
756                         dst += texture->mipmap[0][2] * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770         DPSOFTRAST_Texture_CalculateMipmaps(index);
771 }
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
773 {
774         DPSOFTRAST_Texture *texture;
775         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776         return texture->mipmap[mip][2];
777 }
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][3];
783 }
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][4];
789 }
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         if (texture->binds)
795                 DPSOFTRAST_Flush();
796         return texture->bytes + texture->mipmap[mip][0];
797 }
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
799 {
800         DPSOFTRAST_Texture *texture;
801         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
802         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
803         {
804                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
805                 return;
806         }
807         if (texture->binds)
808                 DPSOFTRAST_Flush();
809         texture->filter = filter;
810 }
811
812 static void DPSOFTRAST_Draw_FlushThreads(void);
813
814 static void DPSOFTRAST_Draw_SyncCommands(void)
815 {
816         if(dpsoftrast.usethreads) MEMORY_BARRIER;
817         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
818 }
819
820 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
821 {
822         DPSOFTRAST_State_Thread *thread;
823         int i;
824         int freecommand = dpsoftrast.commandpool.freecommand;
825         int usedcommands = dpsoftrast.commandpool.usedcommands;
826         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
827                 return;
828         DPSOFTRAST_Draw_SyncCommands();
829         for(;;)
830         {
831                 int waitindex = -1;
832                 int commandoffset;
833                 usedcommands = 0;
834                 for (i = 0; i < dpsoftrast.numthreads; i++)
835                 {
836                         thread = &dpsoftrast.threads[i]; 
837                         commandoffset = freecommand - thread->commandoffset;
838                         if (commandoffset < 0)
839                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
840                         if (commandoffset > usedcommands)
841                         {
842                                 waitindex = i;
843                                 usedcommands = commandoffset;
844                         }
845                 }
846                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
847                         break;
848                 thread = &dpsoftrast.threads[waitindex];
849                 Thread_LockMutex(thread->drawmutex);
850                 if (thread->commandoffset != dpsoftrast.drawcommand)
851                 {
852                         thread->waiting = true;
853                         if (thread->starving) Thread_CondSignal(thread->drawcond);
854                         Thread_CondWait(thread->waitcond, thread->drawmutex);
855                         thread->waiting = false;
856                 }
857                 Thread_UnlockMutex(thread->drawmutex);
858         }
859         dpsoftrast.commandpool.usedcommands = usedcommands;
860 }
861
// Rounds size up to the next multiple of COMMAND_SIZE
// (COMMAND_SIZE is assumed to be a power of two, as the masking requires).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>, sized to the aligned size of that struct.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
866
867 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
868 {
869         DPSOFTRAST_Command *command;
870         int freecommand = dpsoftrast.commandpool.freecommand;
871         int usedcommands = dpsoftrast.commandpool.usedcommands;
872         int extra = sizeof(DPSOFTRAST_Command);
873         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
875         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
876         {
877                 if (dpsoftrast.usethreads)
878                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
879                 else
880                         DPSOFTRAST_Draw_FlushThreads();
881                 freecommand = dpsoftrast.commandpool.freecommand;
882                 usedcommands = dpsoftrast.commandpool.usedcommands;
883         }
884         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
885         {
886                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887                 command->opcode = DPSOFTRAST_OPCODE_Reset;
888                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
889                 freecommand = 0;
890         }
891         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892         command->opcode = opcode;
893         command->commandsize = size;
894         freecommand += size;
895         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
896                 freecommand = 0;
897         dpsoftrast.commandpool.freecommand = freecommand;
898         dpsoftrast.commandpool.usedcommands = usedcommands + size;
899         return command;
900 }
901
902 static void DPSOFTRAST_UndoCommand(int size)
903 {
904         int freecommand = dpsoftrast.commandpool.freecommand;
905         int usedcommands = dpsoftrast.commandpool.usedcommands;
906         freecommand -= size;
907         if (freecommand < 0)
908                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909         usedcommands -= size;
910         dpsoftrast.commandpool.freecommand = freecommand;
911         dpsoftrast.commandpool.usedcommands = usedcommands;
912 }
913                 
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
916 {
917         thread->viewport[0] = command->x;
918         thread->viewport[1] = command->y;
919         thread->viewport[2] = command->width;
920         thread->viewport[3] = command->height;
921         thread->validate |= DPSOFTRAST_VALIDATE_FB;
922 }
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
924 {
925         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
926         command->x = x;
927         command->y = y;
928         command->width = width;
929         command->height = height;
930
931         dpsoftrast.viewport[0] = x;
932         dpsoftrast.viewport[1] = y;
933         dpsoftrast.viewport[2] = width;
934         dpsoftrast.viewport[3] = height;
935         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
936 }
937
938 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
939 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
940 {
941         int i, x1, y1, x2, y2, w, h, x, y;
942         int miny1, maxy1, miny2, maxy2;
943         int bandy;
944         unsigned int *p;
945         unsigned int c;
946         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
947         miny1 = thread->miny1;
948         maxy1 = thread->maxy1;
949         miny2 = thread->miny2;
950         maxy2 = thread->maxy2;
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         // FIXME: honor fb_colormask?
962         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
963         for (i = 0;i < 4;i++)
964         {
965                 if (!dpsoftrast.fb_colorpixels[i])
966                         continue;
967                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
968                 for (;y < bandy;y++)
969                 {
970                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
971                         for (x = x1;x < x2;x++)
972                                 p[x] = c;
973                 }
974         }
975 }
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
977 {
978         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(3, ClearDepth, float depth;)
986 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
987 {
988         int x1, y1, x2, y2, w, h, x, y;
989         int miny1, maxy1, miny2, maxy2;
990         int bandy;
991         unsigned int *p;
992         unsigned int c;
993         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
994         miny1 = thread->miny1;
995         maxy1 = thread->maxy1;
996         miny2 = thread->miny2;
997         maxy2 = thread->maxy2;
998         x1 = thread->fb_scissor[0];
999         y1 = thread->fb_scissor[1];
1000         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1001         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1002         if (y1 < miny1) y1 = miny1;
1003         if (y2 > maxy2) y2 = maxy2;
1004         w = x2 - x1;
1005         h = y2 - y1;
1006         if (w < 1 || h < 1)
1007                 return;
1008         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1009         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1010         for (;y < bandy;y++)
1011         {
1012                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1013                 for (x = x1;x < x2;x++)
1014                         p[x] = c;
1015         }
1016 }
1017 void DPSOFTRAST_ClearDepth(float d)
1018 {
1019         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1020         command->depth = d;
1021 }
1022
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1025 {
1026         thread->colormask[0] = command->r != 0;
1027         thread->colormask[1] = command->g != 0;
1028         thread->colormask[2] = command->b != 0;
1029         thread->colormask[3] = command->a != 0;
1030         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1031 }
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1033 {
1034         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1035         command->r = r;
1036         command->g = g;
1037         command->b = b;
1038         command->a = a;
1039 }
1040
1041 DEFCOMMAND(5, DepthTest, int enable;)
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1043 {
1044         thread->depthtest = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1046 }
1047 void DPSOFTRAST_DepthTest(int enable)
1048 {
1049         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(6, ScissorTest, int enable;)
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1055 {
1056         thread->scissortest = command->enable;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_ScissorTest(int enable)
1060 {
1061         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062         command->enable = enable;
1063 }
1064
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1067 {
1068         thread->scissor[0] = command->x;
1069         thread->scissor[1] = command->y;
1070         thread->scissor[2] = command->width;
1071         thread->scissor[3] = command->height;
1072         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1073 }
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1075 {
1076         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1077         command->x = x;
1078         command->y = y;
1079         command->width = width;
1080         command->height = height;
1081 }
1082
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1085 {
1086         thread->blendfunc[0] = command->sfactor;
1087         thread->blendfunc[1] = command->dfactor;
1088         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1089 }
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1091 {
1092         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093         command->sfactor = sfactor;
1094         command->dfactor = dfactor;
1095 }
1096
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1099 {
1100         thread->blendsubtract = command->enable;
1101         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1102 }
1103 void DPSOFTRAST_BlendSubtract(int enable)
1104 {
1105         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(10, DepthMask, int enable;)
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1111 {
1112         thread->depthmask = command->enable;
1113 }
1114 void DPSOFTRAST_DepthMask(int enable)
1115 {
1116         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117         command->enable = enable;
1118 }
1119
1120 DEFCOMMAND(11, DepthFunc, int func;)
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1122 {
1123         thread->depthfunc = command->func;
1124 }
1125 void DPSOFTRAST_DepthFunc(int func)
1126 {
1127         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128         command->func = func;
1129 }
1130
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1133 {
1134         thread->depthrange[0] = command->nearval;
1135         thread->depthrange[1] = command->farval;
1136 }
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1138 {
1139         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140         command->nearval = nearval;
1141         command->farval = farval;
1142 }
1143
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1146 {
1147         thread->polygonoffset[0] = command->alongnormal;
1148         thread->polygonoffset[1] = command->intoview;
1149 }
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1151 {
1152         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153         command->alongnormal = alongnormal;
1154         command->intoview = intoview;
1155 }
1156
1157 DEFCOMMAND(14, CullFace, int mode;)
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1159 {
1160         thread->cullface = command->mode;
1161 }
1162 void DPSOFTRAST_CullFace(int mode)
1163 {
1164         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165         command->mode = mode;
1166 }
1167
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1169 {
1170         dpsoftrast.color[0] = r;
1171         dpsoftrast.color[1] = g;
1172         dpsoftrast.color[2] = b;
1173         dpsoftrast.color[3] = a;
1174 }
1175
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1177 {
1178         int outstride = blockwidth * 4;
1179         int instride = dpsoftrast.fb_width * 4;
1180         int bx1 = blockx;
1181         int by1 = blocky;
1182         int bx2 = blockx + blockwidth;
1183         int by2 = blocky + blockheight;
1184         int bw;
1185         int x;
1186         int y;
1187         unsigned char *inpixels;
1188         unsigned char *b;
1189         unsigned char *o;
1190         DPSOFTRAST_Flush();
1191         if (bx1 < 0) bx1 = 0;
1192         if (by1 < 0) by1 = 0;
1193         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1195         bw = bx2 - bx1;
1196         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1197         if (dpsoftrast.bigendian)
1198         {
1199                 for (y = by1;y < by2;y++)
1200                 {
1201                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1203                         for (x = bx1;x < bx2;x++)
1204                         {
1205                                 o[0] = b[3];
1206                                 o[1] = b[2];
1207                                 o[2] = b[1];
1208                                 o[3] = b[0];
1209                                 o += 4;
1210                                 b += 4;
1211                         }
1212                 }
1213         }
1214         else
1215         {
1216                 for (y = by1;y < by2;y++)
1217                 {
1218                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1220                         memcpy(o, b, bw*4);
1221                 }
1222         }
1223
1224 }
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1226 {
1227         int tx1 = tx;
1228         int ty1 = ty;
1229         int tx2 = tx + width;
1230         int ty2 = ty + height;
1231         int sx1 = sx;
1232         int sy1 = sy;
1233         int sx2 = sx + width;
1234         int sy2 = sy + height;
1235         int swidth;
1236         int sheight;
1237         int twidth;
1238         int theight;
1239         int sw;
1240         int sh;
1241         int tw;
1242         int th;
1243         int y;
1244         unsigned int *spixels;
1245         unsigned int *tpixels;
1246         DPSOFTRAST_Texture *texture;
1247         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248         if (mip < 0 || mip >= texture->mipmaps) return;
1249         DPSOFTRAST_Flush();
1250         spixels = dpsoftrast.fb_colorpixels[0];
1251         swidth = dpsoftrast.fb_width;
1252         sheight = dpsoftrast.fb_height;
1253         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254         twidth = texture->mipmap[mip][2];
1255         theight = texture->mipmap[mip][3];
1256         if (tx1 < 0) tx1 = 0;
1257         if (ty1 < 0) ty1 = 0;
1258         if (tx2 > twidth) tx2 = twidth;
1259         if (ty2 > theight) ty2 = theight;
1260         if (sx1 < 0) sx1 = 0;
1261         if (sy1 < 0) sy1 = 0;
1262         if (sx2 > swidth) sx2 = swidth;
1263         if (sy2 > sheight) sy2 = sheight;
1264         tw = tx2 - tx1;
1265         th = ty2 - ty1;
1266         sw = sx2 - sx1;
1267         sh = sy2 - sy1;
1268         if (tw > sw) tw = sw;
1269         if (th > sh) th = sh;
1270         if (tw < 1 || th < 1)
1271                 return;
1272         sy1 = sheight - 1 - sy1;
1273         for (y = 0;y < th;y++)
1274                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1275         if (texture->mipmaps > 1)
1276                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1277 }
1278
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1281 {
1282         if (thread->texbound[command->unitnum])
1283                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284         thread->texbound[command->unitnum] = command->texture;
1285 }
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1287 {
1288         DPSOFTRAST_Command_SetTexture *command;
1289         DPSOFTRAST_Texture *texture;
1290         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1291         {
1292                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1293                 return;
1294         }
1295         texture = DPSOFTRAST_Texture_GetByIndex(index);
1296         if (index && !texture)
1297         {
1298                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1299                 return;
1300         }
1301
1302         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303         command->unitnum = unitnum;
1304         command->texture = texture;
1305
1306         dpsoftrast.texbound[unitnum] = texture;
1307         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1308 }
1309
1310 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1311 {
1312         dpsoftrast.pointer_vertex3f = vertex3f;
1313         dpsoftrast.stride_vertex = stride;
1314 }
1315 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1316 {
1317         dpsoftrast.pointer_color4f = color4f;
1318         dpsoftrast.pointer_color4ub = NULL;
1319         dpsoftrast.stride_color = stride;
1320 }
1321 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1322 {
1323         dpsoftrast.pointer_color4f = NULL;
1324         dpsoftrast.pointer_color4ub = color4ub;
1325         dpsoftrast.stride_color = stride;
1326 }
1327 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1328 {
1329         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1330         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1331         dpsoftrast.stride_texcoord[unitnum] = stride;
1332 }
1333
1334 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1335 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1336 {
1337         thread->shader_mode = command->mode;
1338         thread->shader_permutation = command->permutation;
1339         thread->shader_exactspecularmath = command->exactspecularmath;
1340 }
1341 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1342 {
1343         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1344         command->mode = mode;
1345         command->permutation = permutation;
1346         command->exactspecularmath = exactspecularmath;
1347
1348         dpsoftrast.shader_mode = mode;
1349         dpsoftrast.shader_permutation = permutation;
1350         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1351 }
1352
1353 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1354 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1355 {
1356         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1357 }
1358 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1359 {
1360         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1361         command->index = index;
1362         command->val[0] = v0;
1363         command->val[1] = v1;
1364         command->val[2] = v2;
1365         command->val[3] = v3;
1366
1367         dpsoftrast.uniform4f[index*4+0] = v0;
1368         dpsoftrast.uniform4f[index*4+1] = v1;
1369         dpsoftrast.uniform4f[index*4+2] = v2;
1370         dpsoftrast.uniform4f[index*4+3] = v3;
1371 }
1372 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1373 {
1374         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1375         command->index = index;
1376         memcpy(command->val, v, sizeof(command->val));
1377
1378         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1379 }
1380
1381 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1382 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1383 {
1384         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1385 }
1386 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1387 {
1388 #ifdef SSE_POSSIBLE
1389         int i, index;
1390         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1391         {
1392                 __m128 m0, m1, m2, m3;
1393                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1394                 command->index = (DPSOFTRAST_UNIFORM)index;
1395                 if (((size_t)v)&(ALIGN_SIZE-1))
1396                 {
1397                         m0 = _mm_loadu_ps(v);
1398                         m1 = _mm_loadu_ps(v+4);
1399                         m2 = _mm_loadu_ps(v+8);
1400                         m3 = _mm_loadu_ps(v+12);
1401                 }
1402                 else
1403                 {
1404                         m0 = _mm_load_ps(v);
1405                         m1 = _mm_load_ps(v+4);
1406                         m2 = _mm_load_ps(v+8);
1407                         m3 = _mm_load_ps(v+12);
1408                 }
1409                 if (transpose)
1410                 {
1411                         __m128 t0, t1, t2, t3;
1412                         t0 = _mm_unpacklo_ps(m0, m1);
1413                         t1 = _mm_unpacklo_ps(m2, m3);
1414                         t2 = _mm_unpackhi_ps(m0, m1);
1415                         t3 = _mm_unpackhi_ps(m2, m3);
1416                         m0 = _mm_movelh_ps(t0, t1);
1417                         m1 = _mm_movehl_ps(t1, t0);
1418                         m2 = _mm_movelh_ps(t2, t3);
1419                         m3 = _mm_movehl_ps(t3, t2);                     
1420                 }
1421                 _mm_store_ps(command->val, m0);
1422                 _mm_store_ps(command->val+4, m1);
1423                 _mm_store_ps(command->val+8, m2);
1424                 _mm_store_ps(command->val+12, m3);
1425                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1426                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1427                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1428                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1429         }
1430 #endif
1431 }
1432
1433 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1434 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1435 {
1436         thread->uniform1i[command->index] = command->val;
1437 }
1438 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1439 {
1440         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1441         command->index = index;
1442         command->val = i0;
1443
1444         dpsoftrast.uniform1i[command->index] = i0;
1445 }
1446
1447 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1448 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1449 {
1450         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1451         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1452 }
1453 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1454 {
1455         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1456         command->clipplane[0] = x;
1457         command->clipplane[1] = y;
1458         command->clipplane[2] = z;
1459         command->clipplane[3] = w;
1460 }
1461
1462 #ifdef SSE_POSSIBLE
1463 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 {
1465         float *end = dst + size*4;
1466         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1467         {
1468                 while (dst < end)
1469                 {
1470                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1471                         dst += 4;
1472                         src += stride;
1473                 }
1474         }
1475         else
1476         {
1477                 while (dst < end)
1478                 {
1479                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1480                         dst += 4;
1481                         src += stride;
1482                 }
1483         }
1484 }
1485
1486 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1487 {
1488         float *end = dst + size*4;
1489         if (stride == sizeof(float[3]))
1490         {
1491                 float *end4 = dst + (size&~3)*4;        
1492                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1493                 {
1494                         while (dst < end4)
1495                         {
1496                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1497                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1501                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1504                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1505                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1508                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509                                 dst += 16;
1510                                 src += 4*sizeof(float[3]);
1511                         }
1512                 }
1513                 else
1514                 {
1515                         while (dst < end4)
1516                         {
1517                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1518                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1519                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1520                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1522                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1525                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1526                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dst += 16;
1531                                 src += 4*sizeof(float[3]);
1532                         }
1533                 }
1534         }
1535         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1536         {
1537                 while (dst < end)
1538                 {
1539                         __m128 v = _mm_loadu_ps((const float *)src);
1540                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1541                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1542                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1543                         _mm_store_ps(dst, v);
1544                         dst += 4;
1545                         src += stride;
1546                 }
1547         }
1548         else
1549         {
1550                 while (dst < end)
1551                 {
1552                         __m128 v = _mm_load_ps((const float *)src);
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556                         _mm_store_ps(dst, v);
1557                         dst += 4;
1558                         src += stride;
1559                 }
1560         }
1561 }
1562
1563 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1564 {
1565         float *end = dst + size*4;
1566         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1567         if (stride == sizeof(float[2]))
1568         {
1569                 float *end2 = dst + (size&~1)*4;
1570                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1571                 {
1572                         while (dst < end2)
1573                         {
1574                                 __m128 v = _mm_loadu_ps((const float *)src);
1575                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1576                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1577                                 dst += 8;
1578                                 src += 2*sizeof(float[2]);
1579                         }
1580                 }
1581                 else
1582                 {
1583                         while (dst < end2)
1584                         {
1585                                 __m128 v = _mm_load_ps((const float *)src);
1586                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1588                                 dst += 8;
1589                                 src += 2*sizeof(float[2]);
1590                         }
1591                 }
1592         }
1593         while (dst < end)
1594         {
1595                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1596                 dst += 4;
1597                 src += stride;
1598         }
1599 }
1600
1601 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1602 {
1603         float *end = dst + size*4;
1604         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1605         if (stride == sizeof(unsigned char[4]))
1606         {
1607                 float *end4 = dst + (size&~3)*4;
1608                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1609                 {
1610                         while (dst < end4)
1611                         {
1612                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1613                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1614                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1615                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1616                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1617                                 dst += 16;
1618                                 src += 4*sizeof(unsigned char[4]);
1619                         }
1620                 }
1621                 else
1622                 {
1623                         while (dst < end4)
1624                         {
1625                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1630                                 dst += 16;
1631                                 src += 4*sizeof(unsigned char[4]);
1632                         }
1633                 }
1634         }
1635         while (dst < end)
1636         {
1637                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1638                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1639                 dst += 4;
1640                 src += stride;
1641         }
1642 }
1643
// writes 'size' copies of the four floats at src into dst
// src may be unaligned; dst is written with aligned stores, so it has to be 16-byte aligned
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	int i;
	// src is read once, before anything is stored
	__m128 value = _mm_loadu_ps(src);
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + 4*i, value);
}
1654 #endif
1655
// transforms 'numitems' four-float vertices: out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// in4f and inmatrix16f may be unaligned; out4f is written with aligned stores
// does nothing at all when SSE_POSSIBLE is not defined
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: a plain copy is enough, and nothing is needed when operating in place
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// input not on an ALIGN_SIZE boundary: unaligned loads
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			// accumulate from the innermost term outwards: x*mx + (y*my + (z*mz + w*mw))
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1703
// copies 'numitems' four-float vertices from in4f to out4f (the ranges must not overlap, as with memcpy)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1708
1709 #ifdef SSE_POSSIBLE
// Perspective divide plus viewport mapping of one __m128 vertex.
// With in = (x, y, z, w), the vertex is rotated to (1, x, y, z), multiplied by viewportscale,
// divided by w in every lane, offset by viewportcenter, and rotated back, so that
// out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN/sN are lanes of viewportcenter/viewportscale.
// Expands to a brace block that declares locals named p and w: arguments must not be
// expressions that refer to caller variables with those names.  out may be the same variable as in.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1717
// Projects one __m128 vertex: with in = (x, y, z, w) the result is
// out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN/sN are lanes of viewportcenter/viewportscale.
// NOTE(review): despite the name, the expansion computes all four lanes and is token-for-token
// the same as DPSOFTRAST_PROJECTVERTEX; no use of this macro is visible in this section.
// Expands to a brace block that declares locals named p and w.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1725
// Multiplies one __m128 vertex by a matrix given as four __m128 vectors:
// out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3)), each component of in broadcast across all lanes.
// in is copied into a local named p before out is assigned, so out and in may be the same variable;
// arguments must not be expressions that refer to a caller variable named p.
// Expands to a brace block, not a single expression.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1734
1735 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1736 {
1737         int clipmask = 0xFF;
1738         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1739         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1740         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1741         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1742         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1743         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1744         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1745         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1746         #define BBFRONT(k, pos) \
1747         { \
1748                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1749                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1750                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1751                 { \
1752                         __m128 proj; \
1753                         clipmask &= ~(1<<k); \
1754                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1755                         minproj = _mm_min_ss(minproj, proj); \
1756                         maxproj = _mm_max_ss(maxproj, proj); \
1757                 } \
1758         }
1759         BBFRONT(0, minpos); 
1760         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1761         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1762         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1763         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1764         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1765         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1766         BBFRONT(7, maxpos);
1767         #define BBCLIP(k) \
1768         { \
1769                 if (clipmask&(1<<k)) \
1770                 { \
1771                         if (!(clipmask&(1<<(k^1)))) \
1772                         { \
1773                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1774                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1775                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1776                                 minproj = _mm_min_ss(minproj, proj); \
1777                                 maxproj = _mm_max_ss(maxproj, proj); \
1778                         } \
1779                         if (!(clipmask&(1<<(k^2)))) \
1780                         { \
1781                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1782                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1783                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1784                                 minproj = _mm_min_ss(minproj, proj); \
1785                                 maxproj = _mm_max_ss(maxproj, proj); \
1786                         } \
1787                         if (!(clipmask&(1<<(k^4)))) \
1788                         { \
1789                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1790                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1791                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1792                                 minproj = _mm_min_ss(minproj, proj); \
1793                                 maxproj = _mm_max_ss(maxproj, proj); \
1794                         } \
1795                 } \
1796         }
1797         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1798         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1799         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1800         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1801         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1802         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1803         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1804         *starty = _mm_cvttss_si32(maxproj);
1805         *endy = _mm_cvttss_si32(minproj)+1;
1806         return clipmask;
1807 }
1808         
1809 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1810 {
1811         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1812         float *end = out4f + numitems*4;
1813         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1814         __m128 minpos, maxpos;
1815         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1816         {
1817                 minpos = maxpos = _mm_loadu_ps(in4f);
1818                 while (out4f < end)
1819                 {
1820                         __m128 v = _mm_loadu_ps(in4f);
1821                         minpos = _mm_min_ps(minpos, v);
1822                         maxpos = _mm_max_ps(maxpos, v);
1823                         _mm_store_ps(out4f, v);
1824                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1825                         _mm_store_ps(screen4f, v);
1826                         in4f += 4;
1827                         out4f += 4;
1828                         screen4f += 4;
1829                 }
1830         }
1831         else
1832         {
1833                 minpos = maxpos = _mm_load_ps(in4f);
1834                 while (out4f < end)
1835                 {
1836                         __m128 v = _mm_load_ps(in4f);
1837                         minpos = _mm_min_ps(minpos, v);
1838                         maxpos = _mm_max_ps(maxpos, v);
1839                         _mm_store_ps(out4f, v);
1840                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1841                         _mm_store_ps(screen4f, v);
1842                         in4f += 4;
1843                         out4f += 4;
1844                         screen4f += 4;
1845                 }
1846         }
1847         if (starty && endy) 
1848         {
1849                 ALIGN(float minposf[4]);
1850                 ALIGN(float maxposf[4]);
1851                 _mm_store_ps(minposf, minpos);
1852                 _mm_store_ps(maxposf, maxpos);
1853                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1854         }
1855         return 0;
1856 }
1857
1858 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1859 {
1860         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1861         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1862         float *end;
1863         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1864                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1865         end = out4f + numitems*4;
1866         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1867         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1868         m0 = _mm_loadu_ps(inmatrix16f);
1869         m1 = _mm_loadu_ps(inmatrix16f + 4);
1870         m2 = _mm_loadu_ps(inmatrix16f + 8);
1871         m3 = _mm_loadu_ps(inmatrix16f + 12);
1872         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1873         {
1874                 minpos = maxpos = _mm_loadu_ps(in4f);
1875                 while (out4f < end)
1876                 {
1877                         __m128 v = _mm_loadu_ps(in4f);
1878                         minpos = _mm_min_ps(minpos, v);
1879                         maxpos = _mm_max_ps(maxpos, v);
1880                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1881                         _mm_store_ps(out4f, v);
1882                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1883                         _mm_store_ps(screen4f, v);
1884                         in4f += 4;
1885                         out4f += 4;
1886                         screen4f += 4;
1887                 }
1888         }
1889         else
1890         {
1891                 minpos = maxpos = _mm_load_ps(in4f);
1892                 while (out4f < end)
1893                 {
1894                         __m128 v = _mm_load_ps(in4f);
1895                         minpos = _mm_min_ps(minpos, v);
1896                         maxpos = _mm_max_ps(maxpos, v);
1897                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1898                         _mm_store_ps(out4f, v);
1899                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1900                         _mm_store_ps(screen4f, v);
1901                         in4f += 4;
1902                         out4f += 4;
1903                         screen4f += 4;
1904                 }
1905         }
1906         if (starty && endy) 
1907         {
1908                 ALIGN(float minposf[4]);
1909                 ALIGN(float maxposf[4]);
1910                 _mm_store_ps(minposf, minpos);
1911                 _mm_store_ps(maxposf, maxpos);
1912                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1913         }
1914         return 0;
1915 }
1916 #endif
1917
// fills dpsoftrast.post_array4f[outarray] with four-float data converted from the client array selected by inarray,
// covering dpsoftrast.numvertices elements starting at dpsoftrast.firstvertex, and returns that buffer
// color falls back to the constant dpsoftrast.color when neither color pointer is set
// a texcoord array with no pointer, or with a component count other than 2, 3 or 4, leaves the buffer untouched
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	int stride;
	int unit;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			// float colors take precedence over byte colors
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
	}
	else
	{
		// every other array id is treated as a texcoord array, counted from DPSOFTRAST_ARRAY_TEXCOORD0
		unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1976
1977 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1978 {
1979         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1980         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1981         return data;
1982 }
1983
#if 0
// compiled out: loads client array 'inarray' into post array 'outarray' (or reuses the post array when inarray is negative),
// projects it into dpsoftrast.screencoord4f and records the clip mask and row range in dpsoftrast
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1996
// loads client array 'inarray' into post array 'outarray' (or reuses the post array's current contents when inarray is negative),
// transforms it in place by inmatrix16f, writes the projected vertices to dpsoftrast.screencoord4f,
// and records the clip mask in dpsoftrast.drawclipped and the row range in dpsoftrast.drawstarty/drawendy
// returns the transformed buffer, or NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2007
2008 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2009 {
2010         int x;
2011         int startx = span->startx;
2012         int endx = span->endx;
2013         float wslope = triangle->w[0];
2014         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2015         float endz = 1.0f / (w + wslope * startx);
2016         if (triangle->w[0] == 0)
2017         {
2018                 // LordHavoc: fast flat polygons (HUD/menu)
2019                 for (x = startx;x < endx;x++)
2020                         zf[x] = endz;
2021                 return;
2022         }
2023         for (x = startx;x < endx;)
2024         {
2025                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2026                 float z = endz, dz;
2027                 if (nextsub >= endx) nextsub = endsub = endx-1;
2028                 endz = 1.0f / (w + wslope * nextsub);
2029                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2030                 for (; x <= endsub; x++, z += dz)
2031                         zf[x] = z;
2032         }
2033 }
2034
// Writes one shaded span of BGRA8 pixels into color buffer 0, honoring the
// span's pixelmask and the thread's current blend mode.
//
// thread   - supplies shader_permutation (alpha-kill test) and fb_blendmode
// triangle - not referenced by this function
// span     - supplies x/y (framebuffer origin of the span), startx/endx
//            (pixel range within the span) and the pixelmask byte array
// in4ub    - shaded colors, 4 bytes per pixel, indexed by span-relative x;
//            byte 3 of each pixel is used as alpha
//
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise the
// function does nothing.
// NOTE: this function modifies span->pixelmask (alpha-kill, alpha trimming
// and the two sentinel bytes at [endx] and [endx+1]).
void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
{
#ifdef SSE_POSSIBLE
        int x;
        int startx = span->startx;
        int endx = span->endx;
        int maskx;
        int subx;
        const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
        unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
        // nothing to do if there is no color buffer bound
        if (!pixel)
                return;
        // move both views of the framebuffer to the span origin; only pixeli is
        // used for the actual writes below
        pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
        pixeli += span->y * dpsoftrast.fb_width + span->x;
        // handle alphatest now (this affects depth writes too)
        if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
                for (x = startx;x < endx;x++)
                        if (in4ub[x*4+3] < 128)
                                pixelmask[x] = false;
        // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
        // helps sprites, text and hud artwork
        switch(thread->fb_blendmode)
        {
        case DPSOFTRAST_BLENDMODE_ALPHA:
        case DPSOFTRAST_BLENDMODE_ADDALPHA:
        case DPSOFTRAST_BLENDMODE_SUBALPHA:
                // Trim the span to [first pixel with alpha >= 1, end of the last
                // run of such pixels) and mask off zero-alpha runs in between.
                // If no pixel has alpha >= 1, maskx stays at startx and the span
                // becomes empty.
                maskx = startx;
                for (x = startx;x < endx;x++)
                {
                        if (in4ub[x*4+3] >= 1)
                        {
                                startx = x;
                                for (;;)
                                {
                                        // skip over the run of visible pixels
                                        while (++x < endx && in4ub[x*4+3] >= 1) ;
                                        // x is now the first zero-alpha pixel (or endx)
                                        maskx = x;
                                        if (x >= endx) break;
                                        // NOTE(review): x is incremented twice before the
                                        // next alpha test (here and by the pre-increment
                                        // below), so the first two pixels of each gap are
                                        // neither masked nor tested - confirm this is
                                        // intentional
                                        ++x;
                                        while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
                                        if (x >= endx) break;
                                }
                                break;
                        }
                }
                endx = maskx;
                break;
        case DPSOFTRAST_BLENDMODE_OPAQUE:
        case DPSOFTRAST_BLENDMODE_ADD:
        case DPSOFTRAST_BLENDMODE_INVMOD:
        case DPSOFTRAST_BLENDMODE_MUL:
        case DPSOFTRAST_BLENDMODE_MUL2:
        case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
        case DPSOFTRAST_BLENDMODE_INVADD:
                break;
        }
        // put some special values at the end of the mask to ensure the loops end
        // (the "masked" scan stops on the 1, the "unmasked" scan stops on the 0)
        // NOTE(review): assumes pixelmask has room for two bytes past endx - confirm
        pixelmask[endx] = 1;
        pixelmask[endx+1] = 0;
        // LordHavoc: use a double loop to identify subspans, this helps the
        // optimized copy/blend loops to perform at their best, most triangles
        // have only one run of pixels, and do the search using wide reads...
        x = startx;
        while (x < endx)
        {
                // if this pixel is masked off, it's probably not alone...
                if (!pixelmask[x])
                {
                        x++;
#if 1
                        // only use the 4-byte-at-a-time scan when far enough from endx
                        if (x + 8 < endx)
                        {
                                // the 4-item search must be aligned or else it stalls badly
                                // (step one pixel at a time until x is a multiple of 4)
                                // NOTE(review): alignment is judged from x only, which
                                // presumably relies on pixelmask itself being 4-byte
                                // aligned - confirm
                                if ((x & 3) && !pixelmask[x]) 
                                {
                                        if(pixelmask[x]) goto endmasked;
                                        x++;
                                        if (x & 3)
                                        {
                                                if(pixelmask[x]) goto endmasked;
                                                x++;
                                                if (x & 3)
                                                {
                                                        if(pixelmask[x]) goto endmasked;
                                                        x++;
                                                }
                                        }
                                }
                                // skip four masked-off pixels per iteration
                                while (*(unsigned int *)&pixelmask[x] == 0x00000000)
                                        x += 4;
                        }
#endif
                        // finish the scan one pixel at a time; the sentinel at
                        // pixelmask[endx] guarantees termination
                        for (;!pixelmask[x];x++)
                                ;
                        // rather than continue the loop, just check the end variable
                        if (x >= endx)
                                break;
                }
        endmasked:
                // find length of subspan
                // (x is the first drawable pixel, subx will be one past the last)
                subx = x + 1;
#if 1
                if (subx + 8 < endx)
                {
                        // advance to a multiple of 4, leaving early if the run ends
                        if (subx & 3)
                        {
                                if(!pixelmask[subx]) goto endunmasked;
                                subx++;
                                if (subx & 3)
                                {
                                        if(!pixelmask[subx]) goto endunmasked;
                                        subx++;
                                        if (subx & 3)
                                        {
                                                if(!pixelmask[subx]) goto endunmasked;
                                                subx++;
                                        }
                                }
                        }
                        // skip four drawable pixels per iteration (mask bytes are 0 or 1)
                        while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
                                subx += 4;
                }
#endif
                // finish one pixel at a time; the sentinel at pixelmask[endx+1]
                // guarantees termination
                for (;pixelmask[subx];subx++)
                        ;
                // the checks can overshoot, so make sure to clip it...
                if (subx > endx)
                        subx = endx;
        endunmasked:
                // now that we know the subspan length...  process!
                // every case below advances x up to subx
                switch(thread->fb_blendmode)
                {
                case DPSOFTRAST_BLENDMODE_OPAQUE:
                        // straight copy: 16 pixels at a time, then 4, then 2, then 1
#if 0
                        if (subx - x >= 16)
                        {
                                memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
                                x = subx;
                        }
                        else
#elif 1
                        while (x + 16 <= subx)
                        {
                                _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
                                _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
                                _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
                                _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
                                x += 16;
                        }
#endif
                        {
                                while (x + 4 <= subx)
                                {
                                        _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
                                        x += 4;
                                }
                                if (x + 2 <= subx)
                                {
                                        pixeli[x] = ini[x];
                                        pixeli[x+1] = ini[x+1];
                                        x += 2;
                                }
                                if (x < subx)
                                {
                                        pixeli[x] = ini[x];
                                        x++;
                                }
                        }
                        break;
                case DPSOFTRAST_BLENDMODE_ALPHA:
                // FINISHBLEND(blend2, blend1): shared blend loop for all non-opaque
                // modes.  Pixels are widened from 8-bit to 16-bit channels in
                // `src` (incoming color) and `dst` (framebuffer color); the blend
                // statement must leave its result in `dst`, which is then packed
                // back to 8 bits with unsigned saturation and stored.
                // blend2 runs on two pixels at once, blend1 on the single leftover
                // pixel, so the two arguments differ only in how alpha is broadcast.
                // The macro stays defined for the remaining cases of this switch.
                #define FINISHBLEND(blend2, blend1) \
                        for (;x + 1 < subx;x += 2) \
                        { \
                                __m128i src, dst; \
                                src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
                                dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
                                blend2; \
                                _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
                        } \
                        if (x < subx) \
                        { \
                                __m128i src, dst; \
                                src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
                                dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
                                blend1; \
                                pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
                                x++; \
                        }
                        // dst = dst + (src - dst) * alpha / 256
                        // (both factors are shifted left by 4 so that the high 16 bits
                        // of the signed product are the product divided by 256)
                        FINISHBLEND({
                                __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
                        }, {
                                __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
                        });
                        break;
                case DPSOFTRAST_BLENDMODE_ADDALPHA:
                        // dst = dst + src * alpha / 256
                        FINISHBLEND({
                                __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
                        }, {
                                __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
                        });
                        break;
                case DPSOFTRAST_BLENDMODE_ADD:
                        // dst = src + dst (saturated to 255 by the pack in FINISHBLEND)
                        FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
                        break;
                case DPSOFTRAST_BLENDMODE_INVMOD:
                        // dst = dst - dst * src / 256
                        FINISHBLEND({
                                dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
                        }, {
                                dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
                        });
                        break;
                case DPSOFTRAST_BLENDMODE_MUL:
                        // dst = src * dst / 256
                        FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
                        break;
                case DPSOFTRAST_BLENDMODE_MUL2:
                        // dst = src * dst / 128 (saturated to 255 by the pack)
                        FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
                        break;
                case DPSOFTRAST_BLENDMODE_SUBALPHA:
                        // dst = dst - src * alpha / 256 (saturated to 0 by the pack)
                        FINISHBLEND({
                                __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
                        }, {
                                __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
                        });
                        break;
                case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                        // dst = src + dst - dst * alpha / 256
                        FINISHBLEND({
                                __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
                        }, {
                                __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
                                dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
                        });
                        break;
                case DPSOFTRAST_BLENDMODE_INVADD:
                        // dst = dst + (255 - dst) * src / 256
                        FINISHBLEND({
                                dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
                        }, {
                                dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
                        });
                        break;
                }
        }
#endif
}
2286
2287 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2288         // warning: this is SLOW, only use if the optimized per-span functions won't do
2289 {
2290         const unsigned char * RESTRICT pixelbase;
2291         const unsigned char * RESTRICT pixel[4];
2292         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2293         int wrapmask[2] = { width-1, height-1 };
2294         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2295         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2296         {
2297                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2298                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2299                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2300                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2301                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2302                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2303                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2304                 {
2305                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2306                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2307                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2308                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2309                 }
2310                 else
2311                 {
2312                         tci[0] &= wrapmask[0];
2313                         tci[1] &= wrapmask[1];
2314                         tci1[0] &= wrapmask[0];
2315                         tci1[1] &= wrapmask[1];
2316                 }
2317                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2318                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2319                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2320                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2321                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2322                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2323                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2324                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2325         }
2326         else
2327         {
2328                 int tci[2] = { x * width, y * height };
2329                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2330                 {
2331                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2332                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2333                 }
2334                 else
2335                 {
2336                         tci[0] &= wrapmask[0];
2337                         tci[1] &= wrapmask[1];
2338                 }
2339                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2340                 c[0] = pixel[0][0];
2341                 c[1] = pixel[0][1];
2342                 c[2] = pixel[0][2];
2343                 c[3] = pixel[0][3];
2344         }
2345 }
2346
2347 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2348 {
2349         int x;
2350         int startx = span->startx;
2351         int endx = span->endx;
2352         int flags;
2353         float c[4];
2354         float data[4];
2355         float slope[4];
2356         float tc[2], endtc[2];
2357         float tcscale[2];
2358         unsigned int tci[2];
2359         unsigned int tci1[2];
2360         unsigned int tcimin[2];
2361         unsigned int tcimax[2];
2362         int tciwrapmask[2];
2363         int tciwidth;
2364         int filter;
2365         int mip;
2366         const unsigned char * RESTRICT pixelbase;
2367         const unsigned char * RESTRICT pixel[4];
2368         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2369         // if no texture is bound, just fill it with white
2370         if (!texture)
2371         {
2372                 for (x = startx;x < endx;x++)
2373                 {
2374                         out4f[x*4+0] = 1.0f;
2375                         out4f[x*4+1] = 1.0f;
2376                         out4f[x*4+2] = 1.0f;
2377                         out4f[x*4+3] = 1.0f;
2378                 }
2379                 return;
2380         }
2381         mip = triangle->mip[texunitindex];
2382         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2383         // if this mipmap of the texture is 1 pixel, just fill it with that color
2384         if (texture->mipmap[mip][1] == 4)
2385         {
2386                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2387                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2388                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2389                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2390                 for (x = startx;x < endx;x++)
2391                 {
2392                         out4f[x*4+0] = c[0];
2393                         out4f[x*4+1] = c[1];
2394                         out4f[x*4+2] = c[2];
2395                         out4f[x*4+3] = c[3];
2396                 }
2397                 return;
2398         }
2399         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2400         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2401         flags = texture->flags;
2402         tcscale[0] = texture->mipmap[mip][2];
2403         tcscale[1] = texture->mipmap[mip][3];
2404         tciwidth = texture->mipmap[mip][2];
2405         tcimin[0] = 0;
2406         tcimin[1] = 0;
2407         tcimax[0] = texture->mipmap[mip][2]-1;
2408         tcimax[1] = texture->mipmap[mip][3]-1;
2409         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2410         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2411         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2412         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2413         if (filter)
2414         {
2415                 endtc[0] -= 0.5f;
2416                 endtc[1] -= 0.5f;
2417         }
2418         for (x = startx;x < endx;)
2419         {
2420                 unsigned int subtc[2];
2421                 unsigned int substep[2];
2422                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2423                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2424                 if (nextsub >= endx)
2425                 {
2426                         nextsub = endsub = endx-1;      
2427                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2428                 }
2429                 tc[0] = endtc[0];
2430                 tc[1] = endtc[1];
2431                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2432                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2433                 if (filter)
2434                 {
2435                         endtc[0] -= 0.5f;
2436                         endtc[1] -= 0.5f;
2437                 }
2438                 substep[0] = (endtc[0] - tc[0]) * subscale;
2439                 substep[1] = (endtc[1] - tc[1]) * subscale;
2440                 subtc[0] = tc[0] * (1<<12);
2441                 subtc[1] = tc[1] * (1<<12);
2442                 if (filter)
2443                 {
2444                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2445                         {
2446                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2447                                 {
2448                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2449                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2450                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2451                                         tci[0] = subtc[0]>>12;
2452                                         tci[1] = subtc[1]>>12;
2453                                         tci1[0] = tci[0] + 1;
2454                                         tci1[1] = tci[1] + 1;
2455                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2456                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2457                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2458                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2459                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2461                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2462                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2463                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2464                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2465                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2466                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2467                                         out4f[x*4+0] = c[0];
2468                                         out4f[x*4+1] = c[1];
2469                                         out4f[x*4+2] = c[2];
2470                                         out4f[x*4+3] = c[3];
2471                                 }
2472                         }
2473                         else
2474                         {
2475                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2476                                 {
2477                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2478                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2479                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2480                                         tci[0] = subtc[0]>>12;
2481                                         tci[1] = subtc[1]>>12;
2482                                         tci1[0] = tci[0] + 1;
2483                                         tci1[1] = tci[1] + 1;
2484                                         tci[0] &= tciwrapmask[0];
2485                                         tci[1] &= tciwrapmask[1];
2486                                         tci1[0] &= tciwrapmask[0];
2487                                         tci1[1] &= tciwrapmask[1];
2488                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2489                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2490                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2491                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2492                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2493                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2494                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2495                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2496                                         out4f[x*4+0] = c[0];
2497                                         out4f[x*4+1] = c[1];
2498                                         out4f[x*4+2] = c[2];
2499                                         out4f[x*4+3] = c[3];
2500                                 }
2501                         }
2502                 }
2503                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2504                 {
2505                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2506                         {
2507                                 tci[0] = subtc[0]>>12;
2508                                 tci[1] = subtc[1]>>12;
2509                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2510                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2511                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2512                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2513                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2514                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2515                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2516                                 out4f[x*4+0] = c[0];
2517                                 out4f[x*4+1] = c[1];
2518                                 out4f[x*4+2] = c[2];
2519                                 out4f[x*4+3] = c[3];
2520                         }
2521                 }
2522                 else
2523                 {
2524                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2525                         {
2526                                 tci[0] = subtc[0]>>12;
2527                                 tci[1] = subtc[1]>>12;
2528                                 tci[0] &= tciwrapmask[0];
2529                                 tci[1] &= tciwrapmask[1];
2530                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2531                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2532                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2533                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2534                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2535                                 out4f[x*4+0] = c[0];
2536                                 out4f[x*4+1] = c[1];
2537                                 out4f[x*4+2] = c[2];
2538                                 out4f[x*4+3] = c[3];
2539                         }
2540                 }
2541         }
2542 }
2543
2544 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2545 {
2546 #ifdef SSE_POSSIBLE
2547         int x;
2548         int startx = span->startx;
2549         int endx = span->endx;
2550         int flags;
2551         __m128 data, slope, tcscale;
2552         __m128i tcsize, tcmask, tcoffset, tcmax;
2553         __m128 tc, endtc;
2554         __m128i subtc, substep, endsubtc;
2555         int filter;
2556         int mip;
2557         int affine; // LordHavoc: optimized affine texturing case
2558         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2559         const unsigned char * RESTRICT pixelbase;
2560         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2561         // if no texture is bound, just fill it with white
2562         if (!texture)
2563         {
2564                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2565                 return;
2566         }
2567         mip = triangle->mip[texunitindex];
2568         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2569         // if this mipmap of the texture is 1 pixel, just fill it with that color
2570         if (texture->mipmap[mip][1] == 4)
2571         {
2572                 unsigned int k = *((const unsigned int *)pixelbase);
2573                 for (x = startx;x < endx;x++)
2574                         outi[x] = k;
2575                 return;
2576         }
2577         affine = zf[startx] == zf[endx-1];
2578         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2579         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2580         flags = texture->flags;
2581         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2582         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2583         tcscale = _mm_cvtepi32_ps(tcsize);
2584         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2585         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2586         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2587         if (filter)
2588                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2589         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2590         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2591         tcmax = _mm_packs_epi32(tcmask, tcmask);
2592         for (x = startx;x < endx;)
2593         {
2594                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2595                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2596                 if (nextsub >= endx || affine)
2597                 {
2598                         nextsub = endsub = endx-1;
2599                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2600                 }       
2601                 tc = endtc;
2602                 subtc = endsubtc;
2603                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2604                 if (filter)
2605                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2606                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2607                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2608                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2609                 substep = _mm_slli_epi32(substep, 1);
2610                 if (filter)
2611                 {
2612                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2613                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2614                         {
2615                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2616                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2617                                 {
2618                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2619                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2620                                         tci = _mm_madd_epi16(tci, tcoffset);
2621                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2622                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2623                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2624                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2625                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2626                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2627                                         fracm = _mm_srli_epi16(subtc, 1);
2628                                         pix1 = _mm_add_epi16(pix1,
2629                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2630                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2631                                         pix3 = _mm_add_epi16(pix3,
2632                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2633                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2634                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2635                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2636                                         pix2 = _mm_add_epi16(pix2,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2638                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2639                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2640                                 }
2641                                 if (x <= endsub)
2642                                 {
2643                                         const unsigned char * RESTRICT ptr1;
2644                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2645                                         tci = _mm_madd_epi16(tci, tcoffset);
2646                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2647                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2648                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2649                                         fracm = _mm_srli_epi16(subtc, 1);
2650                                         pix1 = _mm_add_epi16(pix1,
2651                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2652                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2653                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2654                                         pix1 = _mm_add_epi16(pix1,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2657                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2658                                         x++;
2659                                 }
2660                         }
2661                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2662                         {
2663                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2664                                 {
2665                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2666                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2667                                         tci = _mm_madd_epi16(tci, tcoffset);
2668                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670                                                                                         _mm_setzero_si128());
2671                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673                                                                                         _mm_setzero_si128());
2674                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2675                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2676                                         tci = _mm_madd_epi16(tci, tcoffset);
2677                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2678                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2679                                                                                         _mm_setzero_si128());
2680                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2681                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2682                                                                                         _mm_setzero_si128());
2683                                         fracm = _mm_srli_epi16(subtc, 1);
2684                                         pix1 = _mm_add_epi16(pix1,
2685                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2686                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2687                                         pix3 = _mm_add_epi16(pix3,
2688                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2689                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2690                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2691                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2692                                         pix2 = _mm_add_epi16(pix2,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2694                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2695                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2696                                 }
2697                                 if (x <= endsub)
2698                                 {
2699                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2700                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2701                                         tci = _mm_madd_epi16(tci, tcoffset);
2702                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2703                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2704                                                                                         _mm_setzero_si128());
2705                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2706                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2707                                                                                         _mm_setzero_si128());
2708                                         fracm = _mm_srli_epi16(subtc, 1);
2709                                         pix1 = _mm_add_epi16(pix1,
2710                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2711                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2712                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2713                                         pix1 = _mm_add_epi16(pix1,
2714                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2715                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2716                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2717                                         x++;
2718                                 }
2719                         }
2720                         else
2721                         {
2722                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2723                                 {
2724                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2725                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2726                                         tci = _mm_madd_epi16(tci, tcoffset);
2727                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2728                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2729                                                                                         _mm_setzero_si128());
2730                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2731                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2732                                                                                         _mm_setzero_si128());
2733                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2734                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2735                                         tci = _mm_madd_epi16(tci, tcoffset);
2736                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2737                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2738                                                                                         _mm_setzero_si128());
2739                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2740                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2741                                                                                         _mm_setzero_si128());
2742                                         fracm = _mm_srli_epi16(subtc, 1);
2743                                         pix1 = _mm_add_epi16(pix1,
2744                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2745                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2746                                         pix3 = _mm_add_epi16(pix3,
2747                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2748                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2749                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2750                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2751                                         pix2 = _mm_add_epi16(pix2,
2752                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2753                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2754                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2755                                 }
2756                                 if (x <= endsub)
2757                                 {
2758                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2759                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2760                                         tci = _mm_madd_epi16(tci, tcoffset);
2761                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2762                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2763                                                                                         _mm_setzero_si128());
2764                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2765                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2766                                                                                         _mm_setzero_si128());
2767                                         fracm = _mm_srli_epi16(subtc, 1);
2768                                         pix1 = _mm_add_epi16(pix1,
2769                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2770                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2771                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2772                                         pix1 = _mm_add_epi16(pix1,
2773                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2774                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2775                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2776                                         x++;
2777                                 }
2778                         }
2779                 }
2780                 else
2781                 {
2782                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2783                         {
2784                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2785                                 {
2786                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2787                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2788                                         tci = _mm_madd_epi16(tci, tcoffset);
2789                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2790                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2791                                 }
2792                                 if (x <= endsub)
2793                                 {
2794                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2795                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2796                                         tci = _mm_madd_epi16(tci, tcoffset);
2797                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2798                                         x++;
2799                                 }
2800                         }
2801                         else
2802                         {
2803                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2804                                 {
2805                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2806                                         tci = _mm_and_si128(tci, tcmax); 
2807                                         tci = _mm_madd_epi16(tci, tcoffset);
2808                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2809                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2810                                 }
2811                                 if (x <= endsub)
2812                                 {
2813                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2814                                         tci = _mm_and_si128(tci, tcmax); 
2815                                         tci = _mm_madd_epi16(tci, tcoffset);
2816                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2817                                         x++;
2818                                 }
2819                         }
2820                 }
2821         }
2822 #endif
2823 }
2824
2825 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2826 {
2827         // TODO: IMPLEMENT
2828         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2829 }
2830
// Shadowmap sampling stub: the input vector is ignored and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2836
2837 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2838 {
2839         int x;
2840         int startx = span->startx;
2841         int endx = span->endx;
2842         float c[4];
2843         float data[4];
2844         float slope[4];
2845         float z;
2846         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2847         for (x = startx;x < endx;x++)
2848         {
2849                 z = zf[x];
2850                 c[0] = (data[0] + slope[0]*x) * z;
2851                 c[1] = (data[1] + slope[1]*x) * z;
2852                 c[2] = (data[2] + slope[2]*x) * z;
2853                 c[3] = (data[3] + slope[3]*x) * z;
2854                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2855                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2856                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2857                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2858         }
2859 }
2860
2861 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2862 {
2863         int x;
2864         int startx = span->startx;
2865         int endx = span->endx;
2866         float c[4];
2867         float data[4];
2868         float slope[4];
2869         float z;
2870         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2871         for (x = startx;x < endx;x++)
2872         {
2873                 z = zf[x];
2874                 c[0] = (data[0] + slope[0]*x) * z;
2875                 c[1] = (data[1] + slope[1]*x) * z;
2876                 c[2] = (data[2] + slope[2]*x) * z;
2877                 c[3] = (data[3] + slope[3]*x) * z;
2878                 out4f[x*4+0] = c[0];
2879                 out4f[x*4+1] = c[1];
2880                 out4f[x*4+2] = c[2];
2881                 out4f[x*4+3] = c[3];
2882         }
2883 }
2884
2885 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2886 {
2887         int x, startx = span->startx, endx = span->endx;
2888         float c[4], localcolor[4];
2889         localcolor[0] = subcolor[0];
2890         localcolor[1] = subcolor[1];
2891         localcolor[2] = subcolor[2];
2892         localcolor[3] = subcolor[3];
2893         for (x = startx;x < endx;x++)
2894         {
2895                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2896                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2897                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2898                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2899                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2900                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2901                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2902                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2903         }
2904 }
2905
2906 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2907 {
2908         int x, startx = span->startx, endx = span->endx;
2909         for (x = startx;x < endx;x++)
2910         {
2911                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2912                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2913                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2914                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2915         }
2916 }
2917
2918 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2919 {
2920         int x, startx = span->startx, endx = span->endx;
2921         for (x = startx;x < endx;x++)
2922         {
2923                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2924                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2925                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2926                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2927         }
2928 }
2929
2930 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2931 {
2932         int x, startx = span->startx, endx = span->endx;
2933         float a, b;
2934         for (x = startx;x < endx;x++)
2935         {
2936                 a = 1.0f - inb4f[x*4+3];
2937                 b = inb4f[x*4+3];
2938                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2939                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2940                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2941                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2942         }
2943 }
2944
2945 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2946 {
2947         int x, startx = span->startx, endx = span->endx;
2948         float localcolor[4], ilerp, lerp;
2949         localcolor[0] = color[0];
2950         localcolor[1] = color[1];
2951         localcolor[2] = color[2];
2952         localcolor[3] = color[3];
2953         ilerp = 1.0f - localcolor[3];
2954         lerp = localcolor[3];
2955         for (x = startx;x < endx;x++)
2956         {
2957                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2958                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2959                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2960                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2961         }
2962 }
2963
2964
2965
2966 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2967 {
2968 #ifdef SSE_POSSIBLE
2969         int x;
2970         int startx = span->startx;
2971         int endx = span->endx;
2972         __m128 data, slope;
2973         __m128 mod, endmod;
2974         __m128i submod, substep, endsubmod;
2975         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2976         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2977         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2978         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2979         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2980         for (x = startx; x < endx;)
2981         {
2982                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2983                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2984                 if (nextsub >= endx)
2985                 {
2986                         nextsub = endsub = endx-1;
2987                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2988                 }
2989                 mod = endmod;
2990                 submod = endsubmod;
2991                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2992                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2993                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2994                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2995                 substep = _mm_packs_epi32(substep, substep);
2996                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2997                 {
2998                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2999                         pix = _mm_mulhi_epu16(pix, submod);
3000                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3001                 }
3002                 if (x <= endsub)
3003                 {
3004                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3005                         pix = _mm_mulhi_epu16(pix, submod);
3006                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3007                         x++;
3008                 }
3009         }
3010 #endif
3011 }
3012
3013 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3014 {
3015 #ifdef SSE_POSSIBLE
3016         int x;
3017         int startx = span->startx;
3018         int endx = span->endx;
3019         __m128 data, slope;
3020         __m128 mod, endmod;
3021         __m128i submod, substep, endsubmod;
3022         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3023         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3024         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3025         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3026         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3027         for (x = startx; x < endx;)
3028         {
3029                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3030                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3031                 if (nextsub >= endx)
3032                 {
3033                         nextsub = endsub = endx-1;
3034                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3035                 }
3036                 mod = endmod;
3037                 submod = endsubmod;
3038                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3039                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3040                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3041                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3042                 substep = _mm_packs_epi32(substep, substep);
3043                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3044                 {
3045                         __m128i pix = _mm_srai_epi16(submod, 4);
3046                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3047                 }
3048                 if (x <= endsub)
3049                 {
3050                         __m128i pix = _mm_srai_epi16(submod, 4);
3051                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3052                         x++;
3053                 }
3054         }
3055 #endif
3056 }
3057
3058 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3059 {
3060 #ifdef SSE_POSSIBLE
3061         int x, startx = span->startx, endx = span->endx;
3062         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3063         localcolor = _mm_packs_epi32(localcolor, localcolor);
3064         for (x = startx;x+2 <= endx;x+=2)
3065         {
3066                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3067                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3068                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3069                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3070         }
3071         if (x < endx)
3072         {
3073                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3074                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3075                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3076                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3077         }
3078 #endif
3079 }
3080
3081 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3082 {
3083 #ifdef SSE_POSSIBLE
3084         int x, startx = span->startx, endx = span->endx;
3085         for (x = startx;x+2 <= endx;x+=2)
3086         {
3087                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3088                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3089                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3090                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3091         }
3092         if (x < endx)
3093         {
3094                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3095                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3096                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3097                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3098         }
3099 #endif
3100 }
3101
3102 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3103 {
3104 #ifdef SSE_POSSIBLE
3105         int x, startx = span->startx, endx = span->endx;
3106         for (x = startx;x+2 <= endx;x+=2)
3107         {
3108                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3109                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3110                 pix1 = _mm_add_epi16(pix1, pix2);
3111                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3112         }
3113         if (x < endx)
3114         {
3115                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3116                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3117                 pix1 = _mm_add_epi16(pix1, pix2);
3118                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3119         }
3120 #endif
3121 }
3122
3123 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3124 {
3125 #ifdef SSE_POSSIBLE
3126         int x, startx = span->startx, endx = span->endx;
3127         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3128         tint = _mm_packs_epi32(tint, tint);
3129         for (x = startx;x+2 <= endx;x+=2)
3130         {
3131                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3132                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3133                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3134                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3135         }
3136         if (x < endx)
3137         {
3138                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3139                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3140                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3141                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3142         }
3143 #endif
3144 }
3145
3146 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3147 {
3148 #ifdef SSE_POSSIBLE
3149         int x, startx = span->startx, endx = span->endx;
3150         for (x = startx;x+2 <= endx;x+=2)
3151         {
3152                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3153                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3154                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3155                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3156                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3157         }
3158         if (x < endx)
3159         {
3160                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3161                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3162                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3163                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3164                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3165         }
3166 #endif
3167 }
3168
3169 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3170 {
3171 #ifdef SSE_POSSIBLE
3172         int x, startx = span->startx, endx = span->endx;
3173         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3174         localcolor = _mm_packs_epi32(localcolor, localcolor);
3175         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3176         for (x = startx;x+2 <= endx;x+=2)
3177         {
3178                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3179                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3180                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3181         }
3182         if (x < endx)
3183         {
3184                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3185                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3186                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3187         }
3188 #endif
3189 }
3190
3191
3192
3193 void DPSOFTRAST_VertexShader_Generic(void)
3194 {
3195         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3196         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3197         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3198         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3199                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3200 }
3201
3202 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3203 {
3204         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3205         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3209         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3210         {
3211                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3212                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3213                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3214                 {
3215                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3216                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3217                         {
3218                                 // multiply
3219                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3220                         }
3221                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3222                         {
3223                                 // add
3224                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3225                         }
3226                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3227                         {
3228                                 // alphablend
3229                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3230                         }
3231                 }
3232         }
3233         else
3234                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3235         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3236 }
3237
3238
3239
// Vertex stage of the post-process shader: projects positions with the
// model-view-projection matrix uniform and loads two texcoord arrays.
void DPSOFTRAST_VertexShader_PostProcess(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
        // NOTE(review): unlike the other loads the two indices differ here (TEXCOORD1 vs
        // TEXCOORD4) - presumably (destination, source), remapping the input array; confirm
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
}
3246
3247 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3248 {
3249         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3250         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3251         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3252         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3253         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3254         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3255         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3256         {
3257                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3258                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3259         }
3260         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3261         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3262         {
3263                 // TODO: implement saturation
3264         }
3265         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3266         {
3267                 // TODO: implement gammaramps
3268         }
3269         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3270 }
3271
3272
3273
// Vertex stage of the depth/shadow shader: only projects positions with the
// model-view-projection matrix uniform; no other vertex arrays are loaded.
void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
}
3278
3279 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3280 {
3281         // this is never called (because colormask is off when this shader is used)
3282         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3285         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3286         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3287 }
3288
3289
3290
// Vertex stage of the flat-color shader: projects positions with the
// model-view-projection matrix uniform and transforms the first texcoord
// array by the texture matrix uniform.
void DPSOFTRAST_VertexShader_FlatColor(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
}
3296
3297 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3298 {
3299 #ifdef SSE_POSSIBLE
3300         unsigned char * RESTRICT pixelmask = span->pixelmask;
3301         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3302         int x, startx = span->startx, endx = span->endx;
3303         __m128i Color_Ambientm;
3304         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3305         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3306         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3307         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3308         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3309         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3310                 pixel = buffer_FragColorbgra8;
3311         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3312         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3313         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3314         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3315         for (x = startx;x < endx;x++)
3316         {
3317                 __m128i color, pix;
3318                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3319                 {
3320                         __m128i pix2;
3321                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3322                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3323                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3324                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3325                         x += 3;
3326                         continue;
3327                 }
3328                 if (!pixelmask[x])
3329                         continue;
3330                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3331                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3332                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3333         }
3334         if (pixel == buffer_FragColorbgra8)
3335                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3336 #endif
3337 }
3338
3339
3340
// Vertex stage of the vertex-color shader: projects positions with the
// model-view-projection matrix uniform, loads the color array and transforms
// the first texcoord array by the texture matrix uniform.
void DPSOFTRAST_VertexShader_VertexColor(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
        DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
}
3347
// Pixel stage of the vertex-color shader for one span.  For every pixel whose
// pixelmask byte is set it computes
//   texture * (ambient + diffuse * interpolated vertex color)
// with the alpha channel scaled by the Alpha uniform instead.  Pixels are
// written straight to the framebuffer when blending is opaque and ALPHAKILL is
// off; otherwise they go to a scratch buffer that is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Compiles to an empty function without SSE_POSSIBLE.
void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
#ifdef SSE_POSSIBLE
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
        int x, startx = span->startx, endx = span->endx;
        __m128i Color_Ambientm, Color_Diffusem;
        __m128 data, slope;
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        int arrayindex = DPSOFTRAST_ARRAY_COLOR;
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
        // anything that needs alpha test or blending is deferred to the finish pass
        if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        // ambient*256 with lanes 0 and 2 exchanged, lane 3 replaced by Alpha*255,
        // packed to 16 bits and duplicated so it covers two pixels
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
        Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
        // diffuse*4096 with lanes 0 and 2 exchanged and lane 3 cleared, so the
        // vertex color never contributes to the alpha channel
        Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
        DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
        // exchange lanes 0 and 2 to match the BGRA byte order, advance to the
        // first pixel and pre-scale by 4096 (4096*4096 >> 16 leaves a factor of 256)
        data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
        slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
        data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
        data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
        slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
        // data is stepped once per pixel by the loop increment, including skipped pixels
        for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
        {
                __m128i color, mod, pix;
                // fast path: four consecutive pixels whose mask bytes are all 1
                if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
                {
                        __m128i pix2, mod2;
                        __m128 z = _mm_loadu_ps(&buffer_z[x]);
                        color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                        // each pixel's attribute is multiplied by its own buffer_z entry;
                        // data is stepped three times here and a fourth time by the loop increment
                        mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
                        data = _mm_add_ps(data, slope);
                        mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
                        data = _mm_add_ps(data, slope);
                        mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
                        data = _mm_add_ps(data, slope);
                        mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                        // (diffuse*vertexcolor + ambient) * texel, with texels widened to value<<8
                        pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
                                                                  _mm_unpacklo_epi8(_mm_setzero_si128(), color));
                        pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
                                                                   _mm_unpackhi_epi8(_mm_setzero_si128(), color));
                        _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                        x += 3;
                        continue;
                }
                // masked-out pixels are left untouched
                if (!pixelmask[x])
                        continue;
                // single-pixel path, same arithmetic as above
                color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
                mod = _mm_packs_epi32(mod, mod);
                pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
                *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
        }
        if (pixel == buffer_FragColorbgra8)
                DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
3412
3413
3414
// Vertex stage of the lightmap shader: projects positions with the
// model-view-projection matrix uniform, transforms the first texcoord array by
// the texture matrix uniform and loads the TEXCOORD4 array (sampled as the
// lightmap coordinate by the matching pixel shader).
void DPSOFTRAST_VertexShader_Lightmap(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
}
3421
// Pixel shader for lightmapped surfaces. Only implemented when SSE_POSSIBLE is
// defined; otherwise the body is empty.
// For every pixel of the span that is set in span->pixelmask it computes, in
// 8-bit BGRA with 16-bit fixed-point intermediates:
//   out = color_texel * (lightmap_texel * Color_Diffuse + Color_Ambient)
//         [+ glow_texel * Color_Glow   when SHADERPERMUTATION_GLOW is set]
// The alpha lane is color_texel.a scaled by DPSOFTRAST_UNIFORM_Alpha (the
// diffuse and glow constants have their alpha lane cleared).
//   thread   - per-thread state; uniforms, shader_permutation and fb_blendmode are read from it
//   triangle - triangle being drawn; only forwarded to the span helper functions
//   span     - span being shaded; supplies x, y, startx, endx and the coverage mask
3422 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3423 {
3424 #ifdef SSE_POSSIBLE
3425         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination is color buffer 0, based so that pixel[x*4] is the pixel at span->x + x
3426         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3427         int x, startx = span->startx, endx = span->endx;
3428         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3429         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3430         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3432         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // NOTE(review): presumably fills buffer_z with the per-pixel perspective term consumed by the texture fetches below - confirm in DPSOFTRAST_Draw_Span_Begin
3434         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // fetch the color texture (TEXCOORD0) and the lightmap (TEXCOORD4) for the whole span
3435         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3436         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // with alpha-kill or any non-opaque blend mode the result goes to a scratch buffer, which is
             // handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end instead of being written to the framebuffer here
3437         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3438                 pixel = buffer_FragColorbgra8;
             // build the ambient constant: uniform * 256 converted to int, lanes 0 and 2 swapped by the
             // shuffle (to line up with the BGRA byte order of the texel buffers) ...
3439         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // ... lane 3 is cleared and replaced by the Alpha uniform scaled by 255 ...
3440         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3441         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
             // ... and the four values are packed to 16 bit, so they repeat in both 64-bit halves
3442         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse constant: same construction, but lane 3 stays zero so it adds nothing to alpha
3443         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3444         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3445         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3446         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3447         {
                     // glow permutation: also fetch the glow texture (TEXCOORD0) and build Color_Glow like the diffuse constant
3448                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3449                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3450                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3451                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3452                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3453                 for (x = startx;x < endx;x++)
3454                 {
3455                         __m128i color, lightmap, glow, pix;
                             // fast path: at least 4 pixels remain and their 4 mask bytes, read as one 32-bit word, are all 1
3456                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3457                         {
3458                                 __m128i pix2;
3459                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3460                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3461                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpack(zero, texels) places each texel byte in the high byte of a 16-bit lane (byte << 8);
                                     // _mm_mulhi_epu16 keeps the upper 16 bits of each product. pix covers pixels 0-1, pix2 pixels 2-3.
3462                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3463                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3464                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3465                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3466                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3467                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                     // saturate to 8 bits per channel and store all 4 pixels
3468                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // skip 3 here; the loop's x++ supplies the fourth step
3469                                 x += 3;
3470                                 continue;
3471                         }
                             // single-pixel path: pixels whose mask byte is 0 are left untouched
3472                         if (!pixelmask[x])
3473                                 continue;
3474                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3475                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3476                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // the upper 4 lanes of lightmap are zero, so after this line the lower half holds
                             // color * (lightmap * diffuse + ambient) and the upper half holds glow * Color_Glow
3477                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // add the upper (glow) half onto the lower (lit color) half
3478                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3479                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3480                 }
3481         }
3482         else
3483         {
                     // same loop structure as above, without the glow term
3484                 for (x = startx;x < endx;x++)
3485                 {
3486                         __m128i color, lightmap, pix;
3487                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3488                         {
3489                                 __m128i pix2;
3490                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3491                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3492                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3493                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3494                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3495                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3496                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3497                                 x += 3;
3498                                 continue;
3499                         }
3500                         if (!pixelmask[x]) 
3501                                 continue;
3502                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3503                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3504                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3505                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3506                 }
3507         }
             // the scratch buffer was selected above: pass it on to the span finisher
3508         if (pixel == buffer_FragColorbgra8)
3509                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3510 #endif
3511 }
3512
3513
// Forward declarations: the FakeLight and LightDirectionMap wrappers that follow
// call these two functions before their definitions appear further down in this file.
3514 void DPSOFTRAST_VertexShader_LightDirection(void);
3515 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3516
// FakeLight vertex shader: has no work of its own and simply forwards to
// DPSOFTRAST_VertexShader_LightDirection.
3517 void DPSOFTRAST_VertexShader_FakeLight(void)
3518 {
3519         DPSOFTRAST_VertexShader_LightDirection();
3520 }
3521
// FakeLight pixel shader: forwards all three arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which branches on thread->shader_mode
// (SHADERMODE_FAKELIGHT) to produce the FakeLight-specific lighting.
3522 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3523 {
3524         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3525 }
3526
3527
3528
// LightDirectionMap (model space) vertex shader: runs the shared
// DPSOFTRAST_VertexShader_LightDirection work, then additionally loads the
// TEXCOORD4 array, which the pixel shader uses for its lightmap and deluxemap lookups.
3529 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3530 {
3531         DPSOFTRAST_VertexShader_LightDirection();
             // NOTE(review): presumably copies the input TEXCOORD4 array to the post-transform array unchanged - confirm in DPSOFTRAST_Array_Load
3532         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3533 }
3534
// LightDirectionMap (model space) pixel shader: forwards all three arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection, which branches on
// thread->shader_mode (SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE).
3535 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3536 {
3537         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3538 }
3539
3540
3541
// LightDirectionMap (tangent space) vertex shader: runs the shared
// DPSOFTRAST_VertexShader_LightDirection work, then additionally loads the
// TEXCOORD4 array, which the pixel shader uses for its lightmap and deluxemap lookups.
3542 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3543 {
3544         DPSOFTRAST_VertexShader_LightDirection();
             // NOTE(review): presumably copies the input TEXCOORD4 array to the post-transform array unchanged - confirm in DPSOFTRAST_Array_Load
3545         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3546 }
3547
// LightDirectionMap (tangent space) pixel shader: forwards all three arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection, which branches on
// thread->shader_mode (SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE).
3548 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3549 {
3550         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3551 }
3552
3553
3554
3555 void DPSOFTRAST_VertexShader_LightDirection(void)
3556 {
3557         int i;
3558         int numvertices = dpsoftrast.numvertices;
3559         float LightDir[4];
3560         float LightVector[4];
3561         float EyePosition[4];
3562         float EyeVectorModelSpace[4];
3563         float EyeVector[4];
3564         float position[4];
3565         float svector[4];
3566         float tvector[4];
3567         float normal[4];
3568         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3569         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3570         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3571         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3572         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3573         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3574         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3575         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3576         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3577         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3578         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3579         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3580         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3581         for (i = 0;i < numvertices;i++)
3582         {
3583                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3584                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3585                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3586                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3587                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3588                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3589                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3590                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3591                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3592                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3593                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3594                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3595                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3596                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3597                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3598                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3599                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3600                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3601                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3602                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3603                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3604                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3605                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3606                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3607                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3608                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3609                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3610                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3611                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3612         }
3613         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3614 }
3615
3616 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3617 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3618 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3619 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3620 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3621 #define DPSOFTRAST_Vector3Normalize(v)\
3622 do\
3623 {\
3624         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3625         if (len)\
3626         {\
3627                 len = 1.0f / len;\
3628                 v[0] *= len;\
3629                 v[1] *= len;\
3630                 v[2] *= len;\
3631         }\
3632 }\
3633 while(0)
3634
3635 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3636 {
3637         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3638         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3639         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3640         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3641         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3642         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3643         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3644         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3645         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3646         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3647         int x, startx = span->startx, endx = span->endx;
3648         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3649         float LightVectordata[4];
3650         float LightVectorslope[4];
3651         float EyeVectordata[4];
3652         float EyeVectorslope[4];
3653         float VectorSdata[4];
3654         float VectorSslope[4];
3655         float VectorTdata[4];
3656         float VectorTslope[4];
3657         float VectorRdata[4];
3658         float VectorRslope[4];
3659         float z;
3660         float diffusetex[4];
3661         float glosstex[4];
3662         float surfacenormal[4];
3663         float lightnormal[4];
3664         float lightnormal_modelspace[4];
3665         float eyenormal[4];
3666         float specularnormal[4];
3667         float diffuse;
3668         float specular;
3669         float SpecularPower;
3670         int d[4];
3671         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3672         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3673         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3674         Color_Glow[3] = 0.0f;
3675         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3676         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3677         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3678         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3679         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3680         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3681         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3682         Color_Pants[3] = 0.0f;
3683         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3684         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3685         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3686         Color_Shirt[3] = 0.0f;
3687         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3688         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3689         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3690         {
3691                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3692                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3693         }
3694         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3695         {
3696                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3697         }
3698         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3699         {
3700                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3701                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3702                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3703                 Color_Diffuse[3] = 0.0f;
3704                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3705                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3706                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3707                 LightColor[3] = 0.0f;
3708                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3710                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3711                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3712                 Color_Specular[3] = 0.0f;
3713                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3714                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3715                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716
3717                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3718                 {
3719                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3720                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3721                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3722                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3723                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3724                 }
3725                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3726                 {
3727                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3728                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3729                 }
3730                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3731                 {
3732                         // nothing of this needed
3733                 }
3734                 else
3735                 {
3736                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3737                 }
3738
3739                 for (x = startx;x < endx;x++)
3740                 {
3741                         z = buffer_z[x];
3742                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3743                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3744                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3745                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3746                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3747                         {
3748                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3749                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3750                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3751                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3752                         }
3753                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3754                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3755                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3756                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3757                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3758                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3759                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3760                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3761
3762                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3763                         {
3764                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3765                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3766                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3767                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3768
3769                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3770                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3771                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3772                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3773
3774                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3775                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3776                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3777                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3778
3779                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3780                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3781                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3782                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3783
3784                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3785                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3786
3787                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3788                                 {
3789                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3790                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3791                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3792                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3793                                 }
3794                         }
3795                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3796                         {
3797                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3798                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3799                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3800                                 {
3801                                         float f = 1.0f / 256.0f;
3802                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3803                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3804                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3805                                 }
3806                         }
3807                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3808                         {
3809                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3810                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3811                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3812                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3813
3814                                 LightColor[0] = 1.0;
3815                                 LightColor[1] = 1.0;
3816                                 LightColor[2] = 1.0;
3817                         }
3818                         else
3819                         {
3820                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3821                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3822                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3823                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3824                         }
3825
3826                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3827
3828                         if(thread->shader_exactspecularmath)
3829                         {
3830                                 // reflect lightnormal at surfacenormal, take the negative of that
3831                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3832                                 float f;
3833                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3834                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3835                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3836                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3837
3838                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3839                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3840                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3841                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3842                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3843
3844                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3845                         }
3846                         else
3847                         {
3848                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3849                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3850                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3851                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3852
3853                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3854                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3855                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3856                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3857
3858                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3859                         }
3860
3861                         specular = pow(specular, SpecularPower * glosstex[3]);
3862                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3863                         {
3864                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3865                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3866                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3867                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3868                         }
3869                         else
3870                         {
3871                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3872                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3873                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3874                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3875                         }
3876
3877                         buffer_FragColorbgra8[x*4+0] = d[0];
3878                         buffer_FragColorbgra8[x*4+1] = d[1];
3879                         buffer_FragColorbgra8[x*4+2] = d[2];
3880                         buffer_FragColorbgra8[x*4+3] = d[3];
3881                 }
3882         }
3883         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3884         {
3885                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3886                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3887                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3888                 Color_Diffuse[3] = 0.0f;
3889                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3890                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3891                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3892                 LightColor[3] = 0.0f;
3893                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3894
3895                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3896                 {
3897                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3898                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3899                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3900                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3901                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3902                 }
3903                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3904                 {
3905                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3906                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3907                 }
3908                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3909                 {
3910                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3911                 }
3912                 else
3913                 {
3914                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3915                 }
3916
3917                 for (x = startx;x < endx;x++)
3918                 {
3919                         z = buffer_z[x];
3920                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3921                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3922                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3923                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3924                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3925                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3926                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3927                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3928
3929                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3930                         {
3931                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3932                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3933                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3934                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3935
3936                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3937                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3938                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3939                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3940
3941                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3942                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3943                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3944                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3945
3946                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3947                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3948                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3949                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3950
3951                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3952                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3953
3954                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3955                                 {
3956                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3957                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3958                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3959                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3960                                 }
3961                         }
3962                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3963                         {
3964                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3965                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3966                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3967                                 {
3968                                         float f = 1.0f / 256.0f;
3969                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3970                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3971                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3972                                 }
3973                         }
3974                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3975                         {
3976                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3977                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3978                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3979                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3980
3981                                 LightColor[0] = 1.0;
3982                                 LightColor[1] = 1.0;
3983                                 LightColor[2] = 1.0;
3984                         }
3985                         else
3986                         {
3987                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3988                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3989                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3990                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3991                         }
3992
3993                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3994                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3995                         {
3996                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3997                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3998                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3999                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4000                         }
4001                         else
4002                         {
4003                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4004                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4005                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4006                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4007                         }
4008                         buffer_FragColorbgra8[x*4+0] = d[0];
4009                         buffer_FragColorbgra8[x*4+1] = d[1];
4010                         buffer_FragColorbgra8[x*4+2] = d[2];
4011                         buffer_FragColorbgra8[x*4+3] = d[3];
4012                 }
4013         }
4014         else
4015         {
4016                 for (x = startx;x < endx;x++)
4017                 {
4018                         z = buffer_z[x];
4019                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4020                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4021                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4022                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4023
4024                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4025                         {
4026                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4027                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4028                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4029                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4030                         }
4031                         else
4032                         {
4033                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4034                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4035                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4036                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4037                         }
4038                         buffer_FragColorbgra8[x*4+0] = d[0];
4039                         buffer_FragColorbgra8[x*4+1] = d[1];
4040                         buffer_FragColorbgra8[x*4+2] = d[2];
4041                         buffer_FragColorbgra8[x*4+3] = d[3];
4042                 }
4043         }
4044         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4045 }
4046
4047
4048
4049 void DPSOFTRAST_VertexShader_LightSource(void)
4050 {
4051         int i;
4052         int numvertices = dpsoftrast.numvertices;
4053         float LightPosition[4];
4054         float LightVector[4];
4055         float LightVectorModelSpace[4];
4056         float EyePosition[4];
4057         float EyeVectorModelSpace[4];
4058         float EyeVector[4];
4059         float position[4];
4060         float svector[4];
4061         float tvector[4];
4062         float normal[4];
4063         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4064         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4065         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4066         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4067         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4068         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4069         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4070         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4071         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4072         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4073         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4074         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4075         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4076         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4077         for (i = 0;i < numvertices;i++)
4078         {
4079                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4080                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4081                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4082                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4083                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4084                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4085                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4086                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4087                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4088                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4089                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4090                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4091                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4092                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4093                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4094                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4095                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4096                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4097                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4098                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4099                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4100                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4101                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4102                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4103                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4104                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4105                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4106                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4107                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4108                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4109                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4110                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4111         }
4112         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4113         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4114 }
4115
4116 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4117 {
4118 #ifdef SSE_POSSIBLE
4119         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4120         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4121         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4122         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4123         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4124         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4125         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4126         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127         int x, startx = span->startx, endx = span->endx;
4128         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4129         float CubeVectordata[4];
4130         float CubeVectorslope[4];
4131         float LightVectordata[4];
4132         float LightVectorslope[4];
4133         float EyeVectordata[4];
4134         float EyeVectorslope[4];
4135         float z;
4136         float diffusetex[4];
4137         float glosstex[4];
4138         float surfacenormal[4];
4139         float lightnormal[4];
4140         float eyenormal[4];
4141         float specularnormal[4];
4142         float diffuse;
4143         float specular;
4144         float SpecularPower;
4145         float CubeVector[4];
4146         float attenuation;
4147         int d[4];
4148         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4149         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4150         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4151         Color_Glow[3] = 0.0f;
4152         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4153         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4154         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4155         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4156         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4157         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4158         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4159         Color_Diffuse[3] = 0.0f;
4160         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4161         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4162         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4163         Color_Specular[3] = 0.0f;
4164         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4165         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4166         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4167         Color_Pants[3] = 0.0f;
4168         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4169         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4170         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4171         Color_Shirt[3] = 0.0f;
4172         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4173         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4174         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4175         LightColor[3] = 0.0f;
4176         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4177         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4178         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4179         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4180         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4181         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4182         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4183         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4184         {
4185                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4186                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4187         }
4188         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4189                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4190         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4191         {
4192                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4193                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4194                 for (x = startx;x < endx;x++)
4195                 {
4196                         z = buffer_z[x];
4197                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4198                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4199                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4200                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4201                         if (attenuation < 0.01f)
4202                                 continue;
4203                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4204                         {
4205                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4206                                 if (attenuation < 0.01f)
4207                                         continue;
4208                         }
4209
4210                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4211                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4212                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4213                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4214                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4215                         {
4216                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4217                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4218                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4219                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4220                         }
4221                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4222                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4223                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4224                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4225                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4226                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4227                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4228                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4229
4230                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4231                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4232                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4233                         DPSOFTRAST_Vector3Normalize(lightnormal);
4234
4235                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4236
4237                         if(thread->shader_exactspecularmath)
4238                         {
4239                                 // reflect lightnormal at surfacenormal, take the negative of that
4240                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4241                                 float f;
4242                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4243                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4244                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4245                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4246
4247                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4248                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4249                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4250                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4251                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4252
4253                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4254                         }
4255                         else
4256                         {
4257                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4258                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4259                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4260                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4261
4262                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4263                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4264                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4265                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4266
4267                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4268                         }
4269                         specular = pow(specular, SpecularPower * glosstex[3]);
4270
4271                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4272                         {
4273                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4274                                 attenuation *= (1.0f / 255.0f);
4275                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4276                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4277                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4278                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4279                         }
4280                         else
4281                         {
4282                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4283                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4284                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4285                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4286                         }
4287                         buffer_FragColorbgra8[x*4+0] = d[0];
4288                         buffer_FragColorbgra8[x*4+1] = d[1];
4289                         buffer_FragColorbgra8[x*4+2] = d[2];
4290                         buffer_FragColorbgra8[x*4+3] = d[3];
4291                 }
4292         }
4293         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4294         {
4295                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4296                 for (x = startx;x < endx;x++)
4297                 {
4298                         z = buffer_z[x];
4299                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4300                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4301                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4302                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4303                         if (attenuation < 0.01f)
4304                                 continue;
4305                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4306                         {
4307                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4308                                 if (attenuation < 0.01f)
4309                                         continue;
4310                         }
4311
4312                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4313                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4314                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4315                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4316                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4317                         {
4318                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4319                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4320                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4321                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4322                         }
4323                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4324                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4325                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4326                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4327
4328                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4329                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4330                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4331                         DPSOFTRAST_Vector3Normalize(lightnormal);
4332
4333                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4334                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4335                         {
4336                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4337                                 attenuation *= (1.0f / 255.0f);
4338                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4339                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4340                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4341                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4342                         }
4343                         else
4344                         {
4345                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4346                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4347                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4348                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4349                         }
4350                         buffer_FragColorbgra8[x*4+0] = d[0];
4351                         buffer_FragColorbgra8[x*4+1] = d[1];
4352                         buffer_FragColorbgra8[x*4+2] = d[2];
4353                         buffer_FragColorbgra8[x*4+3] = d[3];
4354                 }
4355         }
4356         else
4357         {
4358                 for (x = startx;x < endx;x++)
4359                 {
4360                         z = buffer_z[x];
4361                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4362                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4363                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4364                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4365                         if (attenuation < 0.01f)
4366                                 continue;
4367                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4368                         {
4369                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4370                                 if (attenuation < 0.01f)
4371                                         continue;
4372                         }
4373
4374                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4375                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4376                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4377                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4378                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4379                         {
4380                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4381                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4382                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4383                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4384                         }
4385                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4386                         {
4387                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4388                                 attenuation *= (1.0f / 255.0f);
4389                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4390                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4391                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4392                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4393                         }
4394                         else
4395                         {
4396                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4397                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4398                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4399                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4400                         }
4401                         buffer_FragColorbgra8[x*4+0] = d[0];
4402                         buffer_FragColorbgra8[x*4+1] = d[1];
4403                         buffer_FragColorbgra8[x*4+2] = d[2];
4404                         buffer_FragColorbgra8[x*4+3] = d[3];
4405                 }
4406         }
4407         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4408 #endif
4409 }
4410
4411
4412
4413 void DPSOFTRAST_VertexShader_Refraction(void)
4414 {
4415         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4416         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4417         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4418 }
4419
4420 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4421 {
4422         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4423         float z;
4424         int x, startx = span->startx, endx = span->endx;
4425
4426         // texture reads
4427         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4428         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4429
4430         // varyings
4431         float ModelViewProjectionPositiondata[4];
4432         float ModelViewProjectionPositionslope[4];
4433
4434         // uniforms
4435         float ScreenScaleRefractReflect[2];
4436         float ScreenCenterRefractReflect[2];
4437         float DistortScaleRefractReflect[2];
4438         float RefractColor[4];
4439
4440         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4441         if(!texture) return;
4442
4443         // read textures
4444         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4445         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4446
4447         // read varyings
4448         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4449
4450         // read uniforms
4451         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4452         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4453         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4454         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4455         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4456         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4457         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4458         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4459         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4460         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4461
4462         // do stuff
4463         for (x = startx;x < endx;x++)
4464         {
4465                 float SafeScreenTexCoord[2];
4466                 float ScreenTexCoord[2];
4467                 float v[3];
4468                 float iw;
4469                 unsigned char c[4];
4470
4471                 z = buffer_z[x];
4472
4473                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4474                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4475
4476                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4477                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4478                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4479
4480                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4481                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4482                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4483                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4484                 DPSOFTRAST_Vector3Normalize(v);
4485                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4486                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4487
4488                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4489                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4490
4491                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4492                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4493                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4494                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4495         }
4496
4497         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4498 }
4499
4500
4501
4502 void DPSOFTRAST_VertexShader_Water(void)
4503 {
4504         int i;
4505         int numvertices = dpsoftrast.numvertices;
4506         float EyePosition[4];
4507         float EyeVectorModelSpace[4];
4508         float EyeVector[4];
4509         float position[4];
4510         float svector[4];
4511         float tvector[4];
4512         float normal[4];
4513         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4514         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4515         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4516         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4517         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4518         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4519         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4520         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4521         for (i = 0;i < numvertices;i++)
4522         {
4523                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4524                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4525                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4526                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4527                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4528                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4529                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4530                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4531                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4532                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4533                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4534                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4535                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4536                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4537                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4538                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4539                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4540                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4541                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4542                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4545         }
4546         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4547         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4548         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4549 }
4550
4551
4552 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4553 {
4554         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4555         float z;
4556         int x, startx = span->startx, endx = span->endx;
4557
4558         // texture reads
4559         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4560         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4561
4562         // varyings
4563         float ModelViewProjectionPositiondata[4];
4564         float ModelViewProjectionPositionslope[4];
4565         float EyeVectordata[4];
4566         float EyeVectorslope[4];
4567
4568         // uniforms
4569         float ScreenScaleRefractReflect[4];
4570         float ScreenCenterRefractReflect[4];
4571         float DistortScaleRefractReflect[4];
4572         float RefractColor[4];
4573         float ReflectColor[4];
4574         float ReflectFactor;
4575         float ReflectOffset;
4576
4577         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4578         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4579         if(!texture_refraction || !texture_reflection) return;
4580
4581         // read textures
4582         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4583         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4584
4585         // read varyings
4586         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4587         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4588
4589         // read uniforms
4590         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4591         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4592         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4593         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4594         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4595         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4596         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4597         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4598         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4599         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4600         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4601         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4602         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4603         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4604         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4605         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4606         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4607         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4608         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4609         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4610         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4611         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4612
4613         // do stuff
4614         for (x = startx;x < endx;x++)
4615         {
4616                 float SafeScreenTexCoord[4];
4617                 float ScreenTexCoord[4];
4618                 float v[3];
4619                 float iw;
4620                 unsigned char c1[4];
4621                 unsigned char c2[4];
4622                 float Fresnel;
4623
4624                 z = buffer_z[x];
4625
4626                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4627                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4628
4629                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4630                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4631                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4632                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4633                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4634
4635                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4636                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4637                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4638                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4639                 DPSOFTRAST_Vector3Normalize(v);
4640                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4641                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4642                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4643                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4644
4645                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4646                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4647                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4648                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4649                 DPSOFTRAST_Vector3Normalize(v);
4650                 Fresnel = 1.0f - v[2];
4651                 Fresnel = min(1.0f, Fresnel);
4652                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4653
4654                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4655                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4656                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4657                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4658
4659                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4660                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4661                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4662                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4663         }
4664
4665         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4666 }
4667
4668
4669
4670 void DPSOFTRAST_VertexShader_ShowDepth(void)
4671 {
4672         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4673 }
4674
4675 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4676 {
4677         // TODO: IMPLEMENT
4678         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4679         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4680         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4681         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4682         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4683 }
4684
4685
4686
// Vertex stage of the DeferredGeometry shader mode.
// Runs DPSOFTRAST_Array_TransformProject with the position array given as
// both array arguments (presumably source and destination - confirm against
// that helper), using the uniform4f data that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4687 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4688 {
4689         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4690 }
4691
// Span stage of the DeferredGeometry shader mode - still a placeholder.
// It zero-fills the BGRA8 bytes (4 per pixel) for pixels
// [span->startx, span->endx) and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8; no geometry data is written.
4692 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4693 {
4694         // TODO: IMPLEMENT
4695         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // passed to Draw_Span_Begin; not read again in this function
4696         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4697         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4698         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4); // only the span's own pixel range is initialised
4699         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4700 }
4701
4702
4703
// Vertex stage of the DeferredLightSource shader mode.
// Runs DPSOFTRAST_Array_TransformProject with the position array given as
// both array arguments (presumably source and destination - confirm against
// that helper), using the uniform4f data that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4704 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4705 {
4706         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4707 }
4708
// Span stage of the DeferredLightSource shader mode - still a placeholder.
// It zero-fills the BGRA8 bytes (4 per pixel) for pixels
// [span->startx, span->endx) and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8; no lighting is computed.
4709 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4710 {
4711         // TODO: IMPLEMENT
4712         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // passed to Draw_Span_Begin; not read again in this function
4713         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4714         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4715         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4); // only the span's own pixel range is initialised
4716         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4717 }
4718
4719
4720
// Per-shader-mode description: the two stage functions plus the lists of
// vertex arrays and texture units the mode uses. One entry per shader mode
// lives in DPSOFTRAST_ShaderModeTable, indexed by thread->shader_mode.
4721 typedef struct DPSOFTRAST_ShaderModeInfo_s
4722 {
4723         int lodarrayindex; // array index whose attributes DPSOFTRAST_Interpret_Draw uses to compute the mip edge scale
4724         void (*Vertex)(void); // vertex stage function
4725         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // span (pixel) stage, called once per span by DPSOFTRAST_Draw_ProcessSpans
4726         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // array indices used by the mode; readers stop at the first value >= DPSOFTRAST_ARRAY_TOTAL
4727         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // texture units used by the mode; readers stop at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4728 }
4729 DPSOFTRAST_ShaderModeInfo;
4730
// Shader mode descriptions, indexed by thread->shader_mode.
// Each row is {lodarrayindex, vertex function, span function, arrays, texunits}.
// The arrays and texunits lists are terminated by ~0, which becomes the largest
// unsigned char value; the readers in DPSOFTRAST_Interpret_Draw stop at the
// first entry >= DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS.
// Every row must spell out both terminators: an omitted texunits list is
// zero-filled, which reads as "texture unit 0" in every slot instead of as an
// empty list.
4731 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4732 {
4733         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4734         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4735         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4736         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4737         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4738         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4739         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4740         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4741         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4742         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4743         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4744         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4745         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4746         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}, {~0}},
4747         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}, {~0}},
4748         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}, {~0}},
4749 };
4750
// Builds the per-pixel mask for one span and trims the span to the pixels
// that passed.
//
// When thread->depthtest is set and a depth buffer exists, pixelmask[x] for
// startx <= x < endx is the result of comparing the stored depthpixel[x]
// against the span's interpolated depth d (depthbase + depthslope*x) using
// thread->fb_depthfunc; an unrecognised depthfunc behaves like GL_ALWAYS.
// startx/endx are then moved inward past leading and trailing failed pixels.
// Otherwise every mask byte in [startx, endx) is set to 1.
//
// Writes span->pixelmask, span->startx and span->endx. The mask storage is
// thread->pixelmaskarray, which is reused on every call.
//
// NOTE(review): the local `triangle` is assigned but never read here.
4751 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4752 {
4753         int x;
4754         int startx;
4755         int endx;
4756         unsigned int *depthpixel;
4757         int depth;
4758         int depthslope;
4759         unsigned int d;
4760         unsigned char *pixelmask;
4761         DPSOFTRAST_State_Triangle *triangle;
4762         triangle = &thread->triangles[span->triangle];
4763         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x; // depthpixel[x] is then indexed with the same x as pixelmask
4764         startx = span->startx;
4765         endx = span->endx;
4766         depth = span->depthbase;
4767         depthslope = span->depthslope;
4768         pixelmask = thread->pixelmaskarray;
4769         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4770         {
4771                 switch(thread->fb_depthfunc)
4772                 {
4773                 default:
4774                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4775                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4776                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4777                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4778                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4779                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4780                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4781                 }
4782                 while (startx < endx && !pixelmask[startx]) // drop failed pixels at the left edge
4783                         startx++;
4784                 while (endx > startx && !pixelmask[endx-1]) // drop failed pixels at the right edge
4785                         endx--;
4786         }
4787         else
4788         {
4789                 // no depth testing means we're just dealing with color...
4790                 memset(pixelmask + startx, 1, endx - startx);
4791         }
4792         span->pixelmask = pixelmask;
4793         span->startx = startx;
4794         span->endx = endx;
4795 }
4796
// Stores the span's interpolated depth into the depth buffer for every pixel
// in [span->startx, span->endx) whose span->pixelmask entry is non-zero.
// Does nothing unless thread->depthmask and thread->depthtest are both set
// and a depth buffer exists.
//
// d is unsigned, exactly as in DPSOFTRAST_Draw_DepthTest: stepping it by
// depthslope then wraps instead of overflowing a signed int, so the value
// stored here is the same value that was compared against the buffer.
4797 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4798 {
4799         int x, depth, depthslope, startx, endx;
4800         const unsigned char *pixelmask;
4801         unsigned int *depthpixel, d;
4802         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4803         {
4804                 depth = span->depthbase;
4805                 depthslope = span->depthslope;
4806                 pixelmask = span->pixelmask;
4807                 startx = span->startx;
4808                 endx = span->endx;
4809                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4810                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4811                         if (pixelmask[x])
4812                                 depthpixel[x] = d;
4813         }
4814 }
4815
// Processes every span queued on this thread (thread->spans[0..numspans))
// and then empties the queue.
// Per span: run the depth test, skip the span if no pixels remain, run the
// current shader mode's Span function when a colour buffer is present and
// thread->fb_colormask is non-zero, then write depth for the pixels still
// set in the mask.
4816 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4817 {
4818         int i;
4819         DPSOFTRAST_State_Triangle *triangle;
4820         DPSOFTRAST_State_Span *span;
4821         for (i = 0; i < thread->numspans; i++)
4822         {
4823                 span = &thread->spans[i];
4824                 triangle = &thread->triangles[span->triangle]; // span->triangle indexes thread->triangles
4825                 DPSOFTRAST_Draw_DepthTest(thread, span); // may shrink span->startx/endx
4826                 if (span->startx >= span->endx)
4827                         continue;
4828                 // run pixel shader if appropriate
4829                 // do this before running depthmask code, to allow the pixelshader
4830                 // to clear pixelmask values for alpha testing
4831                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4832                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4833                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4834         }
4835         thread->numspans = 0;
4836 }
4837
// Declares the Draw command and its payload fields via DEFCOMMAND
// (presumably producing the DPSOFTRAST_Command_Draw type with id 22 -
// confirm against the DEFCOMMAND definition).
// DPSOFTRAST_Interpret_Draw uses starty/endy for its early band rejection,
// decrements refcount when it bails out, and reads the vertex data from
// arrays (screen coordinates first, then the other arrays).
4838 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4839
4840 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4841 {
4842 #ifdef SSE_POSSIBLE
4843         int cullface = thread->cullface;
4844         int minx, maxx, miny, maxy;
4845         int miny1, maxy1, miny2, maxy2;
4846         __m128i fbmin, fbmax;
4847         __m128 viewportcenter, viewportscale;
4848         int firstvertex = command->firstvertex;
4849         int numvertices = command->numvertices;
4850         int numtriangles = command->numtriangles;
4851         const int *element3i = command->element3i;
4852         const unsigned short *element3s = command->element3s;
4853         int clipped = command->clipped;
4854         int i;
4855         int j;
4856         int k;
4857         int y;
4858         int e[3];
4859         __m128i screeny;
4860         int starty, endy, bandy;
4861         int numpoints;
4862         int clipcase;
4863         float clipdist[4];
4864         float clip0origin, clip0slope;
4865         int clip0dir;
4866         __m128 triangleedge1, triangleedge2, trianglenormal;
4867         __m128 clipfrac[3];
4868         __m128 screen[4];
4869         DPSOFTRAST_State_Triangle *triangle;
4870         DPSOFTRAST_Texture *texture;
4871         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4872         miny = thread->fb_scissor[1];
4873         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4874         miny1 = bound(miny, thread->miny1, maxy);
4875         maxy1 = bound(miny, thread->maxy1, maxy);
4876         miny2 = bound(miny, thread->miny2, maxy);
4877         maxy2 = bound(miny, thread->maxy2, maxy);
4878         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4879         {
4880                 if (!ATOMIC_DECREMENT(command->refcount))
4881                 {
4882                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4883                                 MM_FREE(command->arrays);
4884                 }
4885                 return;
4886         }
4887         minx = thread->fb_scissor[0];
4888         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4889         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4890         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4891         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4892         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4893         screen[3] = _mm_setzero_ps();
4894         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4895         for (i = 0;i < numtriangles;i++)
4896         {
4897                 const float *screencoord4f = command->arrays;
4898                 const float *arrays = screencoord4f + numvertices*4;
4899
4900                 // generate the 3 edges of this triangle
4901                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4902                 if (element3s)
4903                 {
4904                         e[0] = element3s[i*3+0] - firstvertex;
4905                         e[1] = element3s[i*3+1] - firstvertex;
4906                         e[2] = element3s[i*3+2] - firstvertex;
4907                 }
4908                 else if (element3i)
4909                 {
4910                         e[0] = element3i[i*3+0] - firstvertex;
4911                         e[1] = element3i[i*3+1] - firstvertex;
4912                         e[2] = element3i[i*3+2] - firstvertex;
4913                 }
4914                 else
4915                 {
4916                         e[0] = i*3+0;
4917                         e[1] = i*3+1;
4918                         e[2] = i*3+2;
4919                 }
4920
4921 #define SKIPBACKFACE \
4922                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4923                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4924                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4925                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4926                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4927                 switch(cullface) \
4928                 { \
4929                 case GL_BACK: \
4930                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4931                                 continue; \
4932                         break; \
4933                 case GL_FRONT: \
4934                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4935                                 continue; \
4936                         break; \
4937                 }
4938
4939 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4940                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4941                         { \
4942                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4943                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4944                         }
4945 #define CLIPPEDVERTEXCOPY(k,p1) \
4946                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4947
4948 #define GENATTRIBCOPY(attrib, p1) \
4949                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4950 #define GENATTRIBLERP(attrib, p1, p2) \
4951                 { \
4952                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4953                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4954                 }
4955 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4956                 switch(clipcase) \
4957                 { \
4958                 default: \
4959                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4960                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4961                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4962                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4963                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4964                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4965                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4966                 }
4967
4968                 if (! clipped)
4969                         goto notclipped;
4970
4971                 // calculate distance from nearplane
4972                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4973                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4974                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4975                 if (clipdist[0] >= 0.0f)
4976                 {
4977                         if (clipdist[1] >= 0.0f)
4978                         {
4979                                 if (clipdist[2] >= 0.0f)
4980                                 {
4981                                 notclipped:
4982                                         // triangle is entirely in front of nearplane
4983                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4984                                         SKIPBACKFACE;
4985                                         numpoints = 3;
4986                                         clipcase = 0;
4987                                 }
4988                                 else
4989                                 {
4990                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4991                                         SKIPBACKFACE;
4992                                         numpoints = 4;
4993                                         clipcase = 1;
4994                                 }
4995                         }
4996                         else
4997                         {
4998                                 if (clipdist[2] >= 0.0f)
4999                                 {
5000                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5001                                         SKIPBACKFACE;
5002                                         numpoints = 4;
5003                                         clipcase = 2;
5004                                 }
5005                                 else
5006                                 {
5007                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5008                                         SKIPBACKFACE;
5009                                         numpoints = 3;
5010                                         clipcase = 3;
5011                                 }
5012                         }
5013                 }
5014                 else if (clipdist[1] >= 0.0f)
5015                 {
5016                         if (clipdist[2] >= 0.0f)
5017                         {
5018                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5019                                 SKIPBACKFACE;
5020                                 numpoints = 4;
5021                                 clipcase = 4;
5022                         }
5023                         else
5024                         {
5025                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5026                                 SKIPBACKFACE;
5027                                 numpoints = 3;
5028                                 clipcase = 5;
5029                         }
5030                 }
5031                 else if (clipdist[2] >= 0.0f)
5032                 {
5033                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5034                         SKIPBACKFACE;
5035                         numpoints = 3;
5036                         clipcase = 6;
5037                 }
5038                 else continue; // triangle is entirely behind nearplane
5039
5040                 {
5041                         // calculate integer y coords for triangle points
5042                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5043                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5044                                         screenmin = _mm_min_epi16(screeni, screenir),
5045                                         screenmax = _mm_max_epi16(screeni, screenir);
5046                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5047                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5048                         screenmin = _mm_max_epi16(screenmin, fbmin);
5049                         screenmax = _mm_min_epi16(screenmax, fbmax);
5050                         // skip offscreen triangles
5051                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5052                                 continue;
5053                         starty = _mm_extract_epi16(screenmin, 1);
5054                         endy = _mm_extract_epi16(screenmax, 1)+1;
5055                         if (starty >= maxy1 && endy <= miny2)
5056                                 continue;
5057                         screeny = _mm_srai_epi32(screeni, 16);
5058                 }
5059
5060                 triangle = &thread->triangles[thread->numtriangles];
5061
5062                 // calculate attribute plans for triangle data...
5063                 // okay, this triangle is going to produce spans, we'd better project
5064                 // the interpolants now (this is what gives perspective texturing),
5065                 // this consists of simply multiplying all arrays by the W coord
5066                 // (which is basically 1/Z), which will be undone per-pixel
5067                 // (multiplying by Z again) to get the perspective-correct array
5068                 // values
5069                 {
5070                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5071                         __m128 mipedgescale, mipdensity;
5072                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5073                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5074                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5075                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5076                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5077                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5078                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5079                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5080                         attribedge1 = _mm_sub_ss(w0, w1);
5081                         attribedge2 = _mm_sub_ss(w2, w1);
5082                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5083                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5084                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5085                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5086                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5087                         _mm_store_ss(&triangle->w[0], attribxslope);
5088                         _mm_store_ss(&triangle->w[1], attribyslope);
5089                         _mm_store_ss(&triangle->w[2], attriborigin);
5090                         
5091                         clip0origin = 0;
5092                         clip0slope = 0;
5093                         clip0dir = 0;
5094                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5095                         {
5096                                 float cliporigin, clipxslope, clipyslope;
5097                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5098                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5099                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5100                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5101                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5102                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5103                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5104                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5105                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5106                                 if(clipxslope != 0)
5107                                 {
5108                                         clip0origin = -cliporigin/clipxslope;
5109                                         clip0slope = -clipyslope/clipxslope;
5110                                         clip0dir = clipxslope > 0 ? 1 : -1;
5111                                 }
5112                                 else if(clipyslope > 0)
5113                                 {
5114                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5115                                         clip0slope = dpsoftrast.fb_width;
5116                                         clip0dir = -1;
5117                                 }
5118                                 else if(clipyslope < 0)
5119                                 {
5120                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5121                                         clip0slope = -dpsoftrast.fb_width;
5122                                         clip0dir = -1;
5123                                 }
5124                                 else if(clip0origin < 0) continue;
5125                         }
5126
5127                         mipedgescale = _mm_setzero_ps();
5128                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5129                         {
5130                                 __m128 attrib0, attrib1, attrib2;
5131                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5132                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5133                                         break;
5134                                 arrays += numvertices*4;
5135                                 GENATTRIBS(attrib0, attrib1, attrib2);
5136                                 attriborigin = _mm_mul_ps(attrib1, w1);
5137                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5138                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5139                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5140                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5141                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5142                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5143                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5144                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5145                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5146                                 {
5147                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5148                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5149                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5150                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5151                                 }
5152                         }
5153
5154                         memset(triangle->mip, 0, sizeof(triangle->mip));
5155                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5156                         {
5157                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5158                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5159                                         break;
5160                                 texture = thread->texbound[texunit];
5161                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5162                                 {
5163                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5164                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5165                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5166                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5167                                         // this will be multiplied in the texturing routine by the texture resolution
5168                                         y = _mm_cvtss_si32(mipdensity);
5169                                         if (y > 0)
5170                                         {
5171                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5172                                                 if (y > texture->mipmaps - 1)
5173                                                         y = texture->mipmaps - 1;
5174                                                 triangle->mip[texunit] = y;
5175                                         }
5176                                 }
5177                         }
5178                 }
5179         
5180                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5181                 for (; y < bandy;)
5182                 {
5183                         __m128 xcoords, xslope;
5184                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5185                         int yccmask = _mm_movemask_epi8(ycc);
5186                         int edge0p, edge0n, edge1p, edge1n;
5187                         int nexty;
5188                         float w, wslope;
5189                         float clip0;
5190                         if (numpoints == 4)
5191                         {
5192                                 switch(yccmask)
5193                                 {
5194                                 default:
5195                                 case 0xFFFF: /*0000*/ y = endy; continue;
5196                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5197                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5198                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5199                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5200                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5201                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5202                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5203                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5204                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5205                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5206                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5207                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5208                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5209                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5210                                 case 0x0000: /*1111*/ y++; continue;
5211                                 }
5212                         }
5213                         else
5214                         {
5215                                 switch(yccmask)
5216                                 {
5217                                 default:
5218                                 case 0xFFFF: /*000*/ y = endy; continue;
5219                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5220                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5221                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5222                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5223                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5224                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5225                                 case 0x0000: /*111*/ y++; continue;
5226                                 }
5227                         }
5228                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5229                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5230                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5231                         nexty = _mm_extract_epi16(ycc, 0);
5232                         if (nexty >= bandy) nexty = bandy-1;
5233                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5234                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5235                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5236                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5237                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5238                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5239                         {
5240                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5241                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5242                         }
5243                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5244                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5245                         {
5246                                 int startx, endx, offset;
5247                                 startx = _mm_cvtss_si32(xcoords);
5248                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5249                                 if (startx < minx) startx = minx;
5250                                 if (endx > maxx) endx = maxx;
5251                                 if (startx >= endx) continue;
5252
5253                                 if (clip0dir)
5254                                 {
5255                                         if (clip0dir > 0)
5256                                         {
5257                                                 if (startx < clip0) 
5258                                                 {
5259                                                         if(endx <= clip0) continue;
5260                                                         startx = (int)clip0;
5261                                                 }
5262                                         }
5263                                         else if (endx > clip0) 
5264                                         {
5265                                                 if(startx >= clip0) continue;
5266                                                 endx = (int)clip0;
5267                                         }
5268                                 }
5269                                                 
5270                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5271                                 {
5272                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5273                                         span->triangle = thread->numtriangles;
5274                                         span->x = offset;
5275                                         span->y = y;
5276                                         span->startx = 0;
5277                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5278                                         if (span->startx >= span->endx)
5279                                                 continue;
5280                                         wslope = triangle->w[0];
5281                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5282                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5283                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5284                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5285                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5286                                 }
5287                         }
5288                 }
5289
5290                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5291                 {
5292                         DPSOFTRAST_Draw_ProcessSpans(thread);
5293                         thread->numtriangles = 0;
5294                 }
5295         }
5296
5297         if (!ATOMIC_DECREMENT(command->refcount))
5298         {
5299                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5300                         MM_FREE(command->arrays);
5301         }
5302
5303         if (thread->numspans > 0 || thread->numtriangles > 0)
5304         {
5305                 DPSOFTRAST_Draw_ProcessSpans(thread);
5306                 thread->numtriangles = 0;
5307         }
5308 #endif
5309 }
5310
5311 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5312 {
5313         int i;
5314         int j;
5315         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5316         int datasize = 2*numvertices*sizeof(float[4]);
5317         DPSOFTRAST_Command_Draw *command;
5318         unsigned char *data;
5319         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5320         {
5321                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5322                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5323                         break;
5324                 datasize += numvertices*sizeof(float[4]);
5325         }
5326         if (element3s)
5327                 datasize += numtriangles*sizeof(unsigned short[3]);
5328         else if (element3i)
5329                 datasize += numtriangles*sizeof(int[3]);
5330         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5331         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5332         {
5333                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5334                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5335         }
5336         else
5337         {
5338                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5339                 data = (unsigned char *)command + commandsize;
5340         }
5341         command->firstvertex = firstvertex;
5342         command->numvertices = numvertices;
5343         command->numtriangles = numtriangles;
5344         command->arrays = (float *)data;
5345         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5346         dpsoftrast.firstvertex = firstvertex;
5347         dpsoftrast.numvertices = numvertices;
5348         dpsoftrast.screencoord4f = (float *)data;
5349         data += numvertices*sizeof(float[4]);
5350         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5351         data += numvertices*sizeof(float[4]);
5352         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5353         {
5354                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5355                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5356                         break;
5357                 dpsoftrast.post_array4f[j] = (float *)data;
5358                 data += numvertices*sizeof(float[4]);
5359         }
5360         command->element3i = NULL;
5361         command->element3s = NULL;
5362         if (element3s)
5363         {
5364                 command->element3s = (unsigned short *)data;
5365                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5366         }
5367         else if (element3i)
5368         {
5369                 command->element3i = (int *)data;
5370                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5371         }
5372         return command;
5373 }
5374
5375 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5376 {
5377         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5378         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5379         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5380         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5381         if (command->starty >= command->endy)
5382         {
5383                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5384                         MM_FREE(command->arrays);
5385                 DPSOFTRAST_UndoCommand(command->commandsize);
5386                 return;
5387         }
5388         command->clipped = dpsoftrast.drawclipped;
5389         command->refcount = dpsoftrast.numthreads;
5390
5391         if (dpsoftrast.usethreads)
5392         {
5393                 int i;
5394                 DPSOFTRAST_Draw_SyncCommands();
5395                 for (i = 0; i < dpsoftrast.numthreads; i++)
5396                 {
5397                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5398                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5399                                 Thread_CondSignal(thread->drawcond);
5400                 }
5401         }
5402         else
5403         {
5404                 DPSOFTRAST_Draw_FlushThreads();
5405         }
5406 }
5407
5408 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5409 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5410 {
5411         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5412 }
5413 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5414 {
5415         DPSOFTRAST_Command_SetRenderTargets *command;
5416         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5417                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5418                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5419                 DPSOFTRAST_Flush();
5420         dpsoftrast.fb_width = width;
5421         dpsoftrast.fb_height = height;
5422         dpsoftrast.fb_depthpixels = depthpixels;
5423         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5424         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5425         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5426         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5427         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5428         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5429         command->width = width;
5430         command->height = height;
5431 }
5432  
5433 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5434 {
5435         int commandoffset = thread->commandoffset;
5436         while (commandoffset != endoffset)
5437         {
5438                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5439                 switch (command->opcode)
5440                 {
5441 #define INTERPCOMMAND(name) \
5442                 case DPSOFTRAST_OPCODE_##name : \
5443                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5444                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5445                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5446                                 commandoffset = 0; \
5447                         break;
5448                 INTERPCOMMAND(Viewport)
5449                 INTERPCOMMAND(ClearColor)
5450                 INTERPCOMMAND(ClearDepth)
5451                 INTERPCOMMAND(ColorMask)
5452                 INTERPCOMMAND(DepthTest)
5453                 INTERPCOMMAND(ScissorTest)
5454                 INTERPCOMMAND(Scissor)
5455                 INTERPCOMMAND(BlendFunc)
5456                 INTERPCOMMAND(BlendSubtract)
5457                 INTERPCOMMAND(DepthMask)
5458                 INTERPCOMMAND(DepthFunc)
5459                 INTERPCOMMAND(DepthRange)
5460                 INTERPCOMMAND(PolygonOffset)
5461                 INTERPCOMMAND(CullFace)
5462                 INTERPCOMMAND(SetTexture)
5463                 INTERPCOMMAND(SetShader)
5464                 INTERPCOMMAND(Uniform4f)
5465                 INTERPCOMMAND(UniformMatrix4f)
5466                 INTERPCOMMAND(Uniform1i)
5467                 INTERPCOMMAND(SetRenderTargets)
5468                 INTERPCOMMAND(ClipPlane)
5469
5470                 case DPSOFTRAST_OPCODE_Draw:
5471                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5472                         commandoffset += command->commandsize;
5473                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5474                                 commandoffset = 0;
5475                         thread->commandoffset = commandoffset;
5476                         break;
5477
5478                 case DPSOFTRAST_OPCODE_Reset:
5479                         commandoffset = 0;
5480                         break;
5481                 }
5482         }
5483         thread->commandoffset = commandoffset;
5484 }
5485
5486 static int DPSOFTRAST_Draw_Thread(void *data)
5487 {
5488         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5489         while(thread->index >= 0)
5490         {
5491                 if (thread->commandoffset != dpsoftrast.drawcommand)
5492                 {
5493                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5494                 }
5495                 else 
5496                 {
5497                         Thread_LockMutex(thread->drawmutex);
5498                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5499                         {
5500                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5501                                 thread->starving = true;
5502                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5503                                 thread->starving = false;
5504                         }
5505                         Thread_UnlockMutex(thread->drawmutex);
5506                 }
5507         }   
5508         return 0;
5509 }
5510
5511 static void DPSOFTRAST_Draw_FlushThreads(void)
5512 {
5513         DPSOFTRAST_State_Thread *thread;
5514         int i;
5515         DPSOFTRAST_Draw_SyncCommands();
5516         if (dpsoftrast.usethreads) 
5517         {
5518                 for (i = 0; i < dpsoftrast.numthreads; i++)
5519                 {
5520                         thread = &dpsoftrast.threads[i];
5521                         if (thread->commandoffset != dpsoftrast.drawcommand)
5522                         {
5523                                 Thread_LockMutex(thread->drawmutex);
5524                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5525                                         Thread_CondSignal(thread->drawcond);
5526                                 Thread_UnlockMutex(thread->drawmutex);
5527                         }
5528                 }
5529                 for (i = 0; i < dpsoftrast.numthreads; i++)
5530                 {
5531                         thread = &dpsoftrast.threads[i];
5532                         if (thread->commandoffset != dpsoftrast.drawcommand)
5533                         {
5534                                 Thread_LockMutex(thread->drawmutex);
5535                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5536                                 {
5537                                         thread->waiting = true;
5538                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5539                                         thread->waiting = false;
5540                                 }
5541                                 Thread_UnlockMutex(thread->drawmutex);
5542                         }
5543                 }
5544         }
5545         else
5546         {
5547                 for (i = 0; i < dpsoftrast.numthreads; i++)
5548                 {
5549                         thread = &dpsoftrast.threads[i];
5550                         if (thread->commandoffset != dpsoftrast.drawcommand)
5551                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5552                 }
5553         }
5554         dpsoftrast.commandpool.usedcommands = 0;
5555 }
5556
// Public entry point that executes all queued rasterizer commands; it simply
// forwards to DPSOFTRAST_Draw_FlushThreads and returns when that returns.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5561
// Executes all queued rasterizer commands before returning. This does the
// same work as DPSOFTRAST_Flush (a plain call to DPSOFTRAST_Draw_FlushThreads).
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5566
5567 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5568 {
5569         int i;
5570         union
5571         {
5572                 int i;
5573                 unsigned char b[4];
5574         }
5575         u;
5576         u.i = 1;
5577         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5578         dpsoftrast.bigendian = u.b[3];
5579         dpsoftrast.fb_width = width;
5580         dpsoftrast.fb_height = height;
5581         dpsoftrast.fb_depthpixels = depthpixels;
5582         dpsoftrast.fb_colorpixels[0] = colorpixels;
5583         dpsoftrast.fb_colorpixels[1] = NULL;
5584         dpsoftrast.fb_colorpixels[1] = NULL;
5585         dpsoftrast.fb_colorpixels[1] = NULL;
5586         dpsoftrast.viewport[0] = 0;
5587         dpsoftrast.viewport[1] = 0;
5588         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5589         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5590         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5591         dpsoftrast.texture_firstfree = 1;
5592         dpsoftrast.texture_end = 1;
5593         dpsoftrast.texture_max = 0;
5594         dpsoftrast.color[0] = 1;
5595         dpsoftrast.color[1] = 1;
5596         dpsoftrast.color[2] = 1;
5597         dpsoftrast.color[3] = 1;
5598         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5599         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5600         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5601         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5602         for (i = 0; i < dpsoftrast.numthreads; i++)
5603         {
5604                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5605                 thread->index = i;
5606                 thread->cullface = GL_BACK;
5607         thread->colormask[0] = 1; 
5608                 thread->colormask[1] = 1;
5609                 thread->colormask[2] = 1;
5610                 thread->colormask[3] = 1;
5611                 thread->blendfunc[0] = GL_ONE;
5612                 thread->blendfunc[1] = GL_ZERO;
5613                 thread->depthmask = true;
5614                 thread->depthtest = true;
5615                 thread->depthfunc = GL_LEQUAL;
5616                 thread->scissortest = false;
5617                 thread->viewport[0] = 0;
5618                 thread->viewport[1] = 0;
5619                 thread->viewport[2] = dpsoftrast.fb_width;
5620                 thread->viewport[3] = dpsoftrast.fb_height;
5621                 thread->scissor[0] = 0;
5622                 thread->scissor[1] = 0;
5623                 thread->scissor[2] = dpsoftrast.fb_width;
5624                 thread->scissor[3] = dpsoftrast.fb_height;
5625                 thread->depthrange[0] = 0;
5626                 thread->depthrange[1] = 1;
5627                 thread->polygonoffset[0] = 0;
5628                 thread->polygonoffset[1] = 0;
5629                 thread->clipplane[0] = 0;
5630                 thread->clipplane[1] = 0;
5631                 thread->clipplane[2] = 0;
5632                 thread->clipplane[3] = 1;
5633         
5634                 thread->numspans = 0;
5635                 thread->numtriangles = 0;
5636                 thread->commandoffset = 0;
5637                 thread->waiting = false;
5638                 thread->starving = false;
5639            
5640                 thread->validate = -1;
5641                 DPSOFTRAST_Validate(thread, -1);
5642  
5643                 if (dpsoftrast.usethreads)
5644                 {
5645                         thread->waitcond = Thread_CreateCond();
5646                         thread->drawcond = Thread_CreateCond();
5647                         thread->drawmutex = Thread_CreateMutex();
5648                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5649                 }
5650         }
5651         return 0;
5652 }
5653
5654 void DPSOFTRAST_Shutdown(void)
5655 {
5656         int i;
5657         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5658         {
5659                 DPSOFTRAST_State_Thread *thread;
5660                 for (i = 0; i < dpsoftrast.numthreads; i++)
5661                 {
5662                         thread = &dpsoftrast.threads[i];
5663                         Thread_LockMutex(thread->drawmutex);
5664                         thread->index = -1;
5665                         Thread_CondSignal(thread->drawcond);
5666                         Thread_UnlockMutex(thread->drawmutex);
5667                         Thread_WaitThread(thread->thread, 0);
5668                         Thread_DestroyCond(thread->waitcond);
5669                         Thread_DestroyCond(thread->drawcond);
5670                         Thread_DestroyMutex(thread->drawmutex);
5671                 }
5672         }
5673         for (i = 0;i < dpsoftrast.texture_end;i++)
5674                 if (dpsoftrast.texture[i].bytes)
5675                         MM_FREE(dpsoftrast.texture[i].bytes);
5676         if (dpsoftrast.texture)
5677                 free(dpsoftrast.texture);
5678         if (dpsoftrast.threads)
5679                 MM_FREE(dpsoftrast.threads);
5680         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5681 }
5682