/*
 * Source: git.xonotic.org Git - xonotic/darkplaces.git / blob - dpsoftrast.c
 * Revision note: added simple affine check to accelerate texture fetches on 2D art
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Alignment and atomic-counter helpers.
// ALIGN(var) / ATOMIC(var) wrap a declaration to request 16 / 32 byte alignment.
// ATOMIC_COUNTER is the counter type; ATOMIC_INCREMENT / ATOMIC_DECREMENT yield
// the updated value, ATOMIC_ADD yields nothing.
// MEMORY_BARRIER issues _mm_sfence() on every SSE-capable toolchain below.
#if defined(SSE_POSSIBLE)
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		// alternative considered by the original author: __sync_synchronize()
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_sub_and_fetch(&(counter), 1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
		// alternative considered by the original author: MemoryBarrier()
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// Fallbacks for anything the platform section above did not provide:
// no alignment request, no barrier, ordinary (non-atomic) integer arithmetic.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Pixel/buffer allocators. With SSE the memory is aligned to ATOMIC_SIZE via
// _mm_malloc and must be released with MM_FREE (_mm_free); otherwise the
// plain C allocator is used.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned counterpart of calloc(): returns nmemb*size zeroed bytes, or NULL
// when the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() refuses requests whose product overflows; do the same here
	// instead of silently allocating a too-small buffer
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Vertex attribute array slots; DPSOFTRAST_ARRAY_TOTAL is the slot count and
// sizes the per-array tables elsewhere in this file.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
} DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags;
111         int width;
112         int height;
113         int depth;
114         int sides;
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps;
117         int size;
118         ATOMIC_COUNTER binds;
119         unsigned char *bytes;
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// Commands share the general ALIGN() alignment (ALIGN_SIZE bytes).
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header common to every command; the structures generated by DEFCOMMAND
// start with these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // a DPSOFTRAST_OPCODE_ value
	unsigned short commandsize; // NOTE(review): presumably the size of the whole command in bytes - confirm
} DPSOFTRAST_Command);
133
// opcode 0 carries no DEFCOMMAND structure of its own
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned structure
// DPSOFTRAST_Command_<name>, made of the common opcode/commandsize header
// followed by the caller-supplied fields.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
144
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand;
151         int usedcommands;
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluate one attribute array at the start of a span.
// triangle: pointer to a structure with attribs[array][3][4]
//           ([0] = change per x, [1] = change per y, [2] = base value)
// span:     pointer to a structure with integer x and y fields
// slope:    receives attribs[array][0]
// data:     receives base + x*slope + y*attribs[array][1]
// DPSOFTRAST_CALCATTRIB works on __m128 values (requires SSE intrinsics),
// DPSOFTRAST_CALCATTRIB4F on float[4] arrays.
// Both expand to a brace block, not a single expression.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// span is parenthesized so that expression arguments such as &localspan work
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced from a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x, y; // framebuffer coordinates
	int startx, endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
} DPSOFTRAST_State_Span);
193
// capacities of the per-thread span and triangle arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bit flags marking which derived per-thread values must be recomputed
// before use (see DPSOFTRAST_Validate)
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the source and
// destination blend factors; DPSOFTRAST_BLENDMODE_TOTAL is the mode count.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD = 9,
	DPSOFTRAST_BLENDMODE_TOTAL = 10
} DPSOFTRAST_BLENDMODE;
217
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
221         int index;
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2];
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4];
235         int scissor[4];
236         float depthrange[2];
237         float polygonoffset[2];
238         ALIGN(float clipplane[4]);
239
240         int shader_mode;
241         int shader_permutation;
242         int shader_exactspecularmath;
243
244         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245         
246         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
247         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248
249         // DPSOFTRAST_VALIDATE_ flags
250         int validate;
251
252         // derived values (DPSOFTRAST_VALIDATE_FB)
253         int fb_colormask;
254         int fb_scissor[4];
255         ALIGN(float fb_viewportcenter[4]);
256         ALIGN(float fb_viewportscale[4]);
257
258         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
259         int fb_depthfunc;
260
261         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
262         int fb_blendmode;
263
264         // band boundaries
265         int miny1;
266         int maxy1;
267         int miny2;
268         int maxy2;
269
270         ATOMIC(volatile int commandoffset);
271
272         volatile bool waiting;
273         volatile bool starving;
274         void *waitcond;
275         void *drawcond;
276         void *drawmutex;
277
278         int numspans;
279         int numtriangles;
280         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
281         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
282 }
283 DPSOFTRAST_State_Thread);
284
285 typedef ATOMIC(struct DPSOFTRAST_State_s
286 {
287         int fb_width;
288         int fb_height;
289         unsigned int *fb_depthpixels;
290         unsigned int *fb_colorpixels[4];
291
292         int viewport[4];
293         ALIGN(float fb_viewportcenter[4]);
294         ALIGN(float fb_viewportscale[4]);
295
296         float color[4];
297         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
298         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
299
300         const float *pointer_vertex3f;
301         const float *pointer_color4f;
302         const unsigned char *pointer_color4ub;
303         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
304         int stride_vertex;
305         int stride_color;
306         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
308         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
309
310         int firstvertex;
311         int numvertices;
312         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
313         float *screencoord4f;
314         int drawstarty;
315         int drawendy;
316         int drawclipped;
317         
318         int shader_mode;
319         int shader_permutation;
320         int shader_exactspecularmath;
321
322         int texture_max;
323         int texture_end;
324         int texture_firstfree;
325         DPSOFTRAST_Texture *texture;
326
327         int bigendian;
328
329         // error reporting
330         const char *errorstring;
331
332         bool usethreads;
333         int interlace;
334         int numthreads;
335         DPSOFTRAST_State_Thread *threads;
336
337         ATOMIC(volatile int drawcommand);
338
339         DPSOFTRAST_State_Command_Pool commandpool;
340 }
341 DPSOFTRAST_State);
342
343 DPSOFTRAST_State dpsoftrast;
344
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float components into one int as 0xAARRGGBB, each component
// scaled by 255 and rounded by adding 0.5. Arguments are parenthesized so
// expressions can be passed, and the shifts are done on unsigned values
// because shifting 255 into the sign bit of an int is undefined behaviour.
// The result type is still int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to an integer: DPSOFTRAST_DEPTHSCALE * (1 - d), truncated.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
350
351 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
352 {
353         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
354         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
355         fb_viewportcenter[3] = 0.5f;
356         fb_viewportcenter[0] = 0.0f;
357         fb_viewportscale[1] = 0.5f * viewport[2];
358         fb_viewportscale[2] = -0.5f * viewport[3];
359         fb_viewportscale[3] = 0.5f;
360         fb_viewportscale[0] = 1.0f;
361 }
362
363 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
364 {
365         if (dpsoftrast.interlace)
366         {
367                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
371         }
372         else
373         {
374                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
376         }
377 }
378
379 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
380 {
381         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
382         // and viewport projection values
383         int x1, x2;
384         int y1, y2;
385         x1 = thread->scissor[0];
386         x2 = thread->scissor[0] + thread->scissor[2];
387         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
388         y2 = dpsoftrast.fb_height - thread->scissor[1];
389         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
390         if (x1 < 0) x1 = 0;
391         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
392         if (y1 < 0) y1 = 0;
393         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
394         thread->fb_scissor[0] = x1;
395         thread->fb_scissor[1] = y1;
396         thread->fb_scissor[2] = x2 - x1;
397         thread->fb_scissor[3] = y2 - y1;
398
399         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
400         DPSOFTRAST_RecalcThread(thread);
401 }
402
403 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
404 {
405         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
406 }
407
408 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
409 {
410         if (thread->blendsubtract)
411         {
412                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
413                 {
414                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
415                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
416                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
417                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
418                 }
419         }
420         else
421         {       
422                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
423                 {
424                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
426                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
427                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
428                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
429                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
431                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
432                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
433                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
434                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
435                 }
436         }
437 }
438
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is still pending on the thread; always evaluates to 0.
// thread is evaluated more than once, so pass an expression without side
// effects. It is parenthesized so pointer expressions can be passed.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
440
441 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
442 {
443         mask &= thread->validate;
444         if (!mask)
445                 return;
446         if (mask & DPSOFTRAST_VALIDATE_FB)
447         {
448                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
449                 DPSOFTRAST_RecalcFB(thread);
450         }
451         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
452         {
453                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
454                 DPSOFTRAST_RecalcDepthFunc(thread);
455         }
456         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
457         {
458                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
459                 DPSOFTRAST_RecalcBlendFunc(thread);
460         }
461 }
462
463 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
464 {
465         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
466                 return &dpsoftrast.texture[index];
467         return NULL;
468 }
469
470 static void DPSOFTRAST_Texture_Grow(void)
471 {
472         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
473         DPSOFTRAST_State_Thread *thread;
474         int i;
475         int j;
476         DPSOFTRAST_Flush();
477         // expand texture array as needed
478         if (dpsoftrast.texture_max < 1024)
479                 dpsoftrast.texture_max = 1024;
480         else
481                 dpsoftrast.texture_max *= 2;
482         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
483         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
484                 if (dpsoftrast.texbound[i])
485                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
486         for (j = 0; j < dpsoftrast.numthreads; j++)
487         {
488                 thread = &dpsoftrast.threads[j];
489                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
490                         if (thread->texbound[i])
491                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
492         }
493 }
494
495 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
496 {
497         int w;
498         int h;
499         int d;
500         int size;
501         int s;
502         int texnum;
503         int mipmaps;
504         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
505         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
506         DPSOFTRAST_Texture *texture;
507         if (width*height*depth < 1)
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
510                 return 0;
511         }
512         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
513         {
514                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
515                 return 0;
516         }
517         switch(texformat)
518         {
519         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
521         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
522                 break;
523         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
524                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
525                 {
526                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
527                         return 0;
528                 }
529                 if (depth != 1)
530                 {
531                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
532                         return 0;
533                 }
534                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
535                 {
536                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
537                         return 0;
538                 }
539                 break;
540         }
541         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
542         {
543                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
544                 return 0;
545         }
546         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547         {
548                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
549                 return 0;
550         }
551         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552         {
553                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
554                 return 0;
555         }
556         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
557         {
558                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
559                 return 0;
560         }
561         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
562         {
563                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
564                 return 0;
565         }
566         // find first empty slot in texture array
567         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
568                 if (!dpsoftrast.texture[texnum].bytes)
569                         break;
570         dpsoftrast.texture_firstfree = texnum + 1;
571         if (dpsoftrast.texture_max <= texnum)
572                 DPSOFTRAST_Texture_Grow();
573         if (dpsoftrast.texture_end <= texnum)
574                 dpsoftrast.texture_end = texnum + 1;
575         texture = &dpsoftrast.texture[texnum];
576         memset(texture, 0, sizeof(*texture));
577         texture->flags = flags;
578         texture->width = width;
579         texture->height = height;
580         texture->depth = depth;
581         texture->sides = sides;
582         texture->binds = 0;
583         w = width;
584         h = height;
585         d = depth;
586         size = 0;
587         mipmaps = 0;
588         w = width;
589         h = height;
590         d = depth;
591         for (;;)
592         {
593                 s = w * h * d * sides * 4;
594                 texture->mipmap[mipmaps][0] = size;
595                 texture->mipmap[mipmaps][1] = s;
596                 texture->mipmap[mipmaps][2] = w;
597                 texture->mipmap[mipmaps][3] = h;
598                 texture->mipmap[mipmaps][4] = d;
599                 size += s;
600                 mipmaps++;
601                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
602                         break;
603                 if (w > 1) w >>= 1;
604                 if (h > 1) h >>= 1;
605                 if (d > 1) d >>= 1;
606         }
607         texture->mipmaps = mipmaps;
608         texture->size = size;
609
610         // allocate the pixels now
611         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
612
613         return texnum;
614 }
615 void DPSOFTRAST_Texture_Free(int index)
616 {
617         DPSOFTRAST_Texture *texture;
618         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
619         if (texture->binds)
620                 DPSOFTRAST_Flush();
621         if (texture->bytes)
622                 MM_FREE(texture->bytes);
623         texture->bytes = NULL;
624         memset(texture, 0, sizeof(*texture));
625         // adjust the free range and used range
626         if (dpsoftrast.texture_firstfree > index)
627                 dpsoftrast.texture_firstfree = index;
628         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
629                 dpsoftrast.texture_end--;
630 }
631 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
632 {
633         int i, x, y, z, w, layer0, layer1, row0, row1;
634         unsigned char *o, *i0, *i1, *i2, *i3;
635         DPSOFTRAST_Texture *texture;
636         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
637         if (texture->mipmaps <= 1)
638                 return;
639         for (i = 1;i < texture->mipmaps;i++)
640         {
641                 for (z = 0;z < texture->mipmap[i][4];z++)
642                 {
643                         layer0 = z*2;
644                         layer1 = z*2+1;
645                         if (layer1 >= texture->mipmap[i-1][4])
646                                 layer1 = texture->mipmap[i-1][4]-1;
647                         for (y = 0;y < texture->mipmap[i][3];y++)
648                         {
649                                 row0 = y*2;
650                                 row1 = y*2+1;
651                                 if (row1 >= texture->mipmap[i-1][3])
652                                         row1 = texture->mipmap[i-1][3]-1;
653                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
654                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
655                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
656                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
657                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
658                                 w = texture->mipmap[i][2];
659                                 if (layer1 > layer0)
660                                 {
661                                         if (texture->mipmap[i-1][2] > 1)
662                                         {
663                                                 // average 3D texture
664                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
665                                                 {
666                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
667                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
668                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
669                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
670                                                 }
671                                         }
672                                         else
673                                         {
674                                                 // average 3D mipmap with parent width == 1
675                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
676                                                 {
677                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
678                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
679                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
680                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
681                                                 }
682                                         }
683                                 }
684                                 else
685                                 {
686                                         if (texture->mipmap[i-1][2] > 1)
687                                         {
688                                                 // average 2D texture (common case)
689                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
690                                                 {
691                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
692                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
693                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
694                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
695                                                 }
696                                         }
697                                         else
698                                         {
699                                                 // 2D texture with parent width == 1
700                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
701                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
702                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
703                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
704                                         }
705                                 }
706                         }
707                 }
708         }
709 }
710 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
711 {
712         DPSOFTRAST_Texture *texture;
713         unsigned char *dst;
714         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
715         if (texture->binds)
716                 DPSOFTRAST_Flush();
717         if (pixels)
718         {
719                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
720                 while (blockheight > 0)
721                 {
722                         memcpy(dst, pixels, blockwidth * 4);
723                         pixels += blockwidth * 4;
724                         dst += texture->mipmap[0][2] * 4;
725                         blockheight--;
726                 }
727         }
728         DPSOFTRAST_Texture_CalculateMipmaps(index);
729 }
730 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
734         if (texture->binds)
735                 DPSOFTRAST_Flush();
736         if (pixels)
737                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
738         DPSOFTRAST_Texture_CalculateMipmaps(index);
739 }
740 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
741 {
742         DPSOFTRAST_Texture *texture;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
744         return texture->mipmap[mip][2];
745 }
746 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
747 {
748         DPSOFTRAST_Texture *texture;
749         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
750         return texture->mipmap[mip][3];
751 }
752 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
753 {
754         DPSOFTRAST_Texture *texture;
755         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
756         return texture->mipmap[mip][4];
757 }
758 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
759 {
760         DPSOFTRAST_Texture *texture;
761         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
762         if (texture->binds)
763                 DPSOFTRAST_Flush();
764         return texture->bytes + texture->mipmap[mip][0];
765 }
766 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
767 {
768         DPSOFTRAST_Texture *texture;
769         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
770         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
771         {
772                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
773                 return;
774         }
775         if (texture->binds)
776                 DPSOFTRAST_Flush();
777         texture->filter = filter;
778 }
779
780 static void DPSOFTRAST_Draw_FlushThreads(void);
781
782 static void DPSOFTRAST_Draw_SyncCommands(void)
783 {
784         if(dpsoftrast.usethreads) MEMORY_BARRIER;
785         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
786 }
787
788 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
789 {
790         DPSOFTRAST_State_Thread *thread;
791         int i;
792         int freecommand = dpsoftrast.commandpool.freecommand;
793         int usedcommands = dpsoftrast.commandpool.usedcommands;
794         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
795                 return;
796         DPSOFTRAST_Draw_SyncCommands();
797         for(;;)
798         {
799                 int waitindex = -1;
800                 int commandoffset;
801                 usedcommands = 0;
802                 for (i = 0; i < dpsoftrast.numthreads; i++)
803                 {
804                         thread = &dpsoftrast.threads[i]; 
805                         commandoffset = freecommand - thread->commandoffset;
806                         if (commandoffset < 0)
807                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
808                         if (commandoffset > usedcommands)
809                         {
810                                 waitindex = i;
811                                 usedcommands = commandoffset;
812                         }
813                 }
814                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
815                         break;
816                 thread = &dpsoftrast.threads[waitindex];
817                 Thread_LockMutex(thread->drawmutex);
818                 if (thread->commandoffset != dpsoftrast.drawcommand)
819                 {
820                         thread->waiting = true;
821                         if (thread->starving) Thread_CondSignal(thread->drawcond);
822                         Thread_CondWait(thread->waitcond, thread->drawmutex);
823                         thread->waiting = false;
824                 }
825                 Thread_UnlockMutex(thread->drawmutex);
826         }
827         dpsoftrast.commandpool.usedcommands = usedcommands;
828 }
829
// Rounds size up to the next multiple of COMMAND_SIZE.
// NOTE(review): the mask arithmetic assumes COMMAND_SIZE is a power of two - confirm at its definition.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>, using the aligned size of the command struct.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
834
835 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
836 {
837         DPSOFTRAST_Command *command;
838         int freecommand = dpsoftrast.commandpool.freecommand;
839         int usedcommands = dpsoftrast.commandpool.usedcommands;
840         int extra = sizeof(DPSOFTRAST_Command);
841         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
842                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
843         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
844         {
845                 if (dpsoftrast.usethreads)
846                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
847                 else
848                         DPSOFTRAST_Draw_FlushThreads();
849                 freecommand = dpsoftrast.commandpool.freecommand;
850                 usedcommands = dpsoftrast.commandpool.usedcommands;
851         }
852         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853         {
854                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855                 command->opcode = DPSOFTRAST_OPCODE_Reset;
856                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
857                 freecommand = 0;
858         }
859         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
860         command->opcode = opcode;
861         command->commandsize = size;
862         freecommand += size;
863         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
864                 freecommand = 0;
865         dpsoftrast.commandpool.freecommand = freecommand;
866         dpsoftrast.commandpool.usedcommands = usedcommands + size;
867         return command;
868 }
869
870 static void DPSOFTRAST_UndoCommand(int size)
871 {
872         int freecommand = dpsoftrast.commandpool.freecommand;
873         int usedcommands = dpsoftrast.commandpool.usedcommands;
874         freecommand -= size;
875         if (freecommand < 0)
876                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
877         usedcommands -= size;
878         dpsoftrast.commandpool.freecommand = freecommand;
879         dpsoftrast.commandpool.usedcommands = usedcommands;
880 }
881                 
882 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
883 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
884 {
885         thread->viewport[0] = command->x;
886         thread->viewport[1] = command->y;
887         thread->viewport[2] = command->width;
888         thread->viewport[3] = command->height;
889         thread->validate |= DPSOFTRAST_VALIDATE_FB;
890 }
891 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
892 {
893         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
894         command->x = x;
895         command->y = y;
896         command->width = width;
897         command->height = height;
898
899         dpsoftrast.viewport[0] = x;
900         dpsoftrast.viewport[1] = y;
901         dpsoftrast.viewport[2] = width;
902         dpsoftrast.viewport[3] = height;
903         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
904 }
905
906 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
907 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
908 {
909         int i, x1, y1, x2, y2, w, h, x, y;
910         int miny1, maxy1, miny2, maxy2;
911         int bandy;
912         unsigned int *p;
913         unsigned int c;
914         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
915         miny1 = thread->miny1;
916         maxy1 = thread->maxy1;
917         miny2 = thread->miny2;
918         maxy2 = thread->maxy2;
919         x1 = thread->fb_scissor[0];
920         y1 = thread->fb_scissor[1];
921         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
922         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
923         if (y1 < miny1) y1 = miny1;
924         if (y2 > maxy2) y2 = maxy2;
925         w = x2 - x1;
926         h = y2 - y1;
927         if (w < 1 || h < 1)
928                 return;
929         // FIXME: honor fb_colormask?
930         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
931         for (i = 0;i < 4;i++)
932         {
933                 if (!dpsoftrast.fb_colorpixels[i])
934                         continue;
935                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
936                 for (;y < bandy;y++)
937                 {
938                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
939                         for (x = x1;x < x2;x++)
940                                 p[x] = c;
941                 }
942         }
943 }
944 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
945 {
946         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
947         command->r = r;
948         command->g = g;
949         command->b = b;
950         command->a = a;
951 }
952
953 DEFCOMMAND(3, ClearDepth, float depth;)
954 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
955 {
956         int x1, y1, x2, y2, w, h, x, y;
957         int miny1, maxy1, miny2, maxy2;
958         int bandy;
959         unsigned int *p;
960         unsigned int c;
961         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
962         miny1 = thread->miny1;
963         maxy1 = thread->maxy1;
964         miny2 = thread->miny2;
965         maxy2 = thread->maxy2;
966         x1 = thread->fb_scissor[0];
967         y1 = thread->fb_scissor[1];
968         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
969         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
970         if (y1 < miny1) y1 = miny1;
971         if (y2 > maxy2) y2 = maxy2;
972         w = x2 - x1;
973         h = y2 - y1;
974         if (w < 1 || h < 1)
975                 return;
976         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
977         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
978         for (;y < bandy;y++)
979         {
980                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
981                 for (x = x1;x < x2;x++)
982                         p[x] = c;
983         }
984 }
985 void DPSOFTRAST_ClearDepth(float d)
986 {
987         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
988         command->depth = d;
989 }
990
991 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
992 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
993 {
994         thread->colormask[0] = command->r != 0;
995         thread->colormask[1] = command->g != 0;
996         thread->colormask[2] = command->b != 0;
997         thread->colormask[3] = command->a != 0;
998         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
999 }
1000 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1001 {
1002         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1003         command->r = r;
1004         command->g = g;
1005         command->b = b;
1006         command->a = a;
1007 }
1008
1009 DEFCOMMAND(5, DepthTest, int enable;)
1010 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1011 {
1012         thread->depthtest = command->enable;
1013         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1014 }
1015 void DPSOFTRAST_DepthTest(int enable)
1016 {
1017         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1018         command->enable = enable;
1019 }
1020
1021 DEFCOMMAND(6, ScissorTest, int enable;)
1022 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1023 {
1024         thread->scissortest = command->enable;
1025         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 }
1027 void DPSOFTRAST_ScissorTest(int enable)
1028 {
1029         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1030         command->enable = enable;
1031 }
1032
1033 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1034 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1035 {
1036         thread->scissor[0] = command->x;
1037         thread->scissor[1] = command->y;
1038         thread->scissor[2] = command->width;
1039         thread->scissor[3] = command->height;
1040         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1041 }
1042 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1043 {
1044         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1045         command->x = x;
1046         command->y = y;
1047         command->width = width;
1048         command->height = height;
1049 }
1050
1051 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1052 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1053 {
1054         thread->blendfunc[0] = command->sfactor;
1055         thread->blendfunc[1] = command->dfactor;
1056         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1057 }
1058 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1059 {
1060         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1061         command->sfactor = sfactor;
1062         command->dfactor = dfactor;
1063 }
1064
1065 DEFCOMMAND(9, BlendSubtract, int enable;)
1066 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1067 {
1068         thread->blendsubtract = command->enable;
1069         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1070 }
1071 void DPSOFTRAST_BlendSubtract(int enable)
1072 {
1073         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1074         command->enable = enable;
1075 }
1076
1077 DEFCOMMAND(10, DepthMask, int enable;)
1078 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1079 {
1080         thread->depthmask = command->enable;
1081 }
1082 void DPSOFTRAST_DepthMask(int enable)
1083 {
1084         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1085         command->enable = enable;
1086 }
1087
1088 DEFCOMMAND(11, DepthFunc, int func;)
1089 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1090 {
1091         thread->depthfunc = command->func;
1092 }
1093 void DPSOFTRAST_DepthFunc(int func)
1094 {
1095         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1096         command->func = func;
1097 }
1098
1099 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1100 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1101 {
1102         thread->depthrange[0] = command->nearval;
1103         thread->depthrange[1] = command->farval;
1104 }
1105 void DPSOFTRAST_DepthRange(float nearval, float farval)
1106 {
1107         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1108         command->nearval = nearval;
1109         command->farval = farval;
1110 }
1111
1112 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1113 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1114 {
1115         thread->polygonoffset[0] = command->alongnormal;
1116         thread->polygonoffset[1] = command->intoview;
1117 }
1118 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1119 {
1120         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1121         command->alongnormal = alongnormal;
1122         command->intoview = intoview;
1123 }
1124
1125 DEFCOMMAND(14, CullFace, int mode;)
1126 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1127 {
1128         thread->cullface = command->mode;
1129 }
1130 void DPSOFTRAST_CullFace(int mode)
1131 {
1132         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1133         command->mode = mode;
1134 }
1135
1136 DEFCOMMAND(15, AlphaTest, int enable;)
1137 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1138 {
1139         thread->alphatest = command->enable;
1140 }
1141 void DPSOFTRAST_AlphaTest(int enable)
1142 {
1143         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1144         command->enable = enable;
1145 }
1146
1147 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1148 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1149 {
1150         thread->alphafunc = command->func;
1151         thread->alphavalue = command->ref;
1152 }
1153 void DPSOFTRAST_AlphaFunc(int func, float ref)
1154 {
1155         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1156         command->func = func;
1157         command->ref = ref;
1158 }
1159
1160 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1161 {
1162         dpsoftrast.color[0] = r;
1163         dpsoftrast.color[1] = g;
1164         dpsoftrast.color[2] = b;
1165         dpsoftrast.color[3] = a;
1166 }
1167
1168 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1169 {
1170         int outstride = blockwidth * 4;
1171         int instride = dpsoftrast.fb_width * 4;
1172         int bx1 = blockx;
1173         int by1 = blocky;
1174         int bx2 = blockx + blockwidth;
1175         int by2 = blocky + blockheight;
1176         int bw;
1177         int x;
1178         int y;
1179         unsigned char *inpixels;
1180         unsigned char *b;
1181         unsigned char *o;
1182         DPSOFTRAST_Flush();
1183         if (bx1 < 0) bx1 = 0;
1184         if (by1 < 0) by1 = 0;
1185         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1186         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1187         bw = bx2 - bx1;
1188         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1189         if (dpsoftrast.bigendian)
1190         {
1191                 for (y = by1;y < by2;y++)
1192                 {
1193                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1194                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1195                         for (x = bx1;x < bx2;x++)
1196                         {
1197                                 o[0] = b[3];
1198                                 o[1] = b[2];
1199                                 o[2] = b[1];
1200                                 o[3] = b[0];
1201                                 o += 4;
1202                                 b += 4;
1203                         }
1204                 }
1205         }
1206         else
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         memcpy(o, b, bw*4);
1213                 }
1214         }
1215
1216 }
1217 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1218 {
1219         int tx1 = tx;
1220         int ty1 = ty;
1221         int tx2 = tx + width;
1222         int ty2 = ty + height;
1223         int sx1 = sx;
1224         int sy1 = sy;
1225         int sx2 = sx + width;
1226         int sy2 = sy + height;
1227         int swidth;
1228         int sheight;
1229         int twidth;
1230         int theight;
1231         int sw;
1232         int sh;
1233         int tw;
1234         int th;
1235         int y;
1236         unsigned int *spixels;
1237         unsigned int *tpixels;
1238         DPSOFTRAST_Texture *texture;
1239         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1240         if (mip < 0 || mip >= texture->mipmaps) return;
1241         DPSOFTRAST_Flush();
1242         spixels = dpsoftrast.fb_colorpixels[0];
1243         swidth = dpsoftrast.fb_width;
1244         sheight = dpsoftrast.fb_height;
1245         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1246         twidth = texture->mipmap[mip][2];
1247         theight = texture->mipmap[mip][3];
1248         if (tx1 < 0) tx1 = 0;
1249         if (ty1 < 0) ty1 = 0;
1250         if (tx2 > twidth) tx2 = twidth;
1251         if (ty2 > theight) ty2 = theight;
1252         if (sx1 < 0) sx1 = 0;
1253         if (sy1 < 0) sy1 = 0;
1254         if (sx2 > swidth) sx2 = swidth;
1255         if (sy2 > sheight) sy2 = sheight;
1256         tw = tx2 - tx1;
1257         th = ty2 - ty1;
1258         sw = sx2 - sx1;
1259         sh = sy2 - sy1;
1260         if (tw > sw) tw = sw;
1261         if (th > sh) th = sh;
1262         if (tw < 1 || th < 1)
1263                 return;
1264         sy1 = sheight - 1 - sy1;
1265         for (y = 0;y < th;y++)
1266                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1267         if (texture->mipmaps > 1)
1268                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1269 }
1270
1271 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1272 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1273 {
1274         if (thread->texbound[command->unitnum])
1275                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1276         thread->texbound[command->unitnum] = command->texture;
1277 }
1278 void DPSOFTRAST_SetTexture(int unitnum, int index)
1279 {
1280         DPSOFTRAST_Command_SetTexture *command;
1281         DPSOFTRAST_Texture *texture;
1282         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1283         {
1284                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1285                 return;
1286         }
1287         texture = DPSOFTRAST_Texture_GetByIndex(index);
1288         if (index && !texture)
1289         {
1290                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1291                 return;
1292         }
1293
1294         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1295         command->unitnum = unitnum;
1296         command->texture = texture;
1297
1298         dpsoftrast.texbound[unitnum] = texture;
1299         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1300 }
1301
1302 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1303 {
1304         dpsoftrast.pointer_vertex3f = vertex3f;
1305         dpsoftrast.stride_vertex = stride;
1306 }
1307 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1308 {
1309         dpsoftrast.pointer_color4f = color4f;
1310         dpsoftrast.pointer_color4ub = NULL;
1311         dpsoftrast.stride_color = stride;
1312 }
1313 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1314 {
1315         dpsoftrast.pointer_color4f = NULL;
1316         dpsoftrast.pointer_color4ub = color4ub;
1317         dpsoftrast.stride_color = stride;
1318 }
1319 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1320 {
1321         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1322         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1323         dpsoftrast.stride_texcoord[unitnum] = stride;
1324 }
1325
1326 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1327 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1328 {
1329         thread->shader_mode = command->mode;
1330         thread->shader_permutation = command->permutation;
1331         thread->shader_exactspecularmath = command->exactspecularmath;
1332 }
1333 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1334 {
1335         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1336         command->mode = mode;
1337         command->permutation = permutation;
1338         command->exactspecularmath = exactspecularmath;
1339
1340         dpsoftrast.shader_mode = mode;
1341         dpsoftrast.shader_permutation = permutation;
1342         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1343 }
1344
1345 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1346 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1347 {
1348         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 }
1350 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1351 {
1352         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1353         command->index = index;
1354         command->val[0] = v0;
1355         command->val[1] = v1;
1356         command->val[2] = v2;
1357         command->val[3] = v3;
1358
1359         dpsoftrast.uniform4f[index*4+0] = v0;
1360         dpsoftrast.uniform4f[index*4+1] = v1;
1361         dpsoftrast.uniform4f[index*4+2] = v2;
1362         dpsoftrast.uniform4f[index*4+3] = v3;
1363 }
1364 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1365 {
1366         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1367         command->index = index;
1368         memcpy(command->val, v, sizeof(command->val));
1369
1370         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1371 }
1372
1373 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1374 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1375 {
1376         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1377 }
1378 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1379 {
1380 #ifdef SSE_POSSIBLE
1381         int i, index;
1382         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1383         {
1384                 __m128 m0, m1, m2, m3;
1385                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1386                 command->index = (DPSOFTRAST_UNIFORM)index;
1387                 if (((size_t)v)&(ALIGN_SIZE-1))
1388                 {
1389                         m0 = _mm_loadu_ps(v);
1390                         m1 = _mm_loadu_ps(v+4);
1391                         m2 = _mm_loadu_ps(v+8);
1392                         m3 = _mm_loadu_ps(v+12);
1393                 }
1394                 else
1395                 {
1396                         m0 = _mm_load_ps(v);
1397                         m1 = _mm_load_ps(v+4);
1398                         m2 = _mm_load_ps(v+8);
1399                         m3 = _mm_load_ps(v+12);
1400                 }
1401                 if (transpose)
1402                 {
1403                         __m128 t0, t1, t2, t3;
1404                         t0 = _mm_unpacklo_ps(m0, m1);
1405                         t1 = _mm_unpacklo_ps(m2, m3);
1406                         t2 = _mm_unpackhi_ps(m0, m1);
1407                         t3 = _mm_unpackhi_ps(m2, m3);
1408                         m0 = _mm_movelh_ps(t0, t1);
1409                         m1 = _mm_movehl_ps(t1, t0);
1410                         m2 = _mm_movelh_ps(t2, t3);
1411                         m3 = _mm_movehl_ps(t3, t2);                     
1412                 }
1413                 _mm_store_ps(command->val, m0);
1414                 _mm_store_ps(command->val+4, m1);
1415                 _mm_store_ps(command->val+8, m2);
1416                 _mm_store_ps(command->val+12, m3);
1417                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1418                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1419                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1420                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1421         }
1422 #endif
1423 }
1424
1425 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1426 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1427 {
1428         thread->uniform1i[command->index] = command->val;
1429 }
1430 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1431 {
1432         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1433         command->index = index;
1434         command->val = i0;
1435
1436         dpsoftrast.uniform1i[command->index] = i0;
1437 }
1438
1439 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1440 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1441 {
1442         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1443 }
1444 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1445 {
1446         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1447         x /= dpsoftrast.fb_viewportscale[1];
1448         y /= dpsoftrast.fb_viewportscale[2];
1449         z /= dpsoftrast.fb_viewportscale[3];
1450         w /= dpsoftrast.fb_viewportscale[0];
1451         w -= dpsoftrast.fb_viewportcenter[1]*x + dpsoftrast.fb_viewportcenter[2]*y + dpsoftrast.fb_viewportcenter[3]*z + dpsoftrast.fb_viewportcenter[0]*w; 
1452         command->clipplane[0] = x;
1453         command->clipplane[1] = y;
1454         command->clipplane[2] = z;
1455         command->clipplane[3] = w;
1456 }
1457
1458 #ifdef SSE_POSSIBLE
1459 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1460 {
1461         float *end = dst + size*4;
1462         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1463         {
1464                 while (dst < end)
1465                 {
1466                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1467                         dst += 4;
1468                         src += stride;
1469                 }
1470         }
1471         else
1472         {
1473                 while (dst < end)
1474                 {
1475                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1476                         dst += 4;
1477                         src += stride;
1478                 }
1479         }
1480 }
1481
1482 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1483 {
1484         float *end = dst + size*4;
1485         if (stride == sizeof(float[3]))
1486         {
1487                 float *end4 = dst + (size&~3)*4;        
1488                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1489                 {
1490                         while (dst < end4)
1491                         {
1492                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1493                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1494                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1497                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1498                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1499                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1500                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1501                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1504                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1505                                 dst += 16;
1506                                 src += 4*sizeof(float[3]);
1507                         }
1508                 }
1509                 else
1510                 {
1511                         while (dst < end4)
1512                         {
1513                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1514                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1515                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1516                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1518                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1521                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1522                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1525                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1526                                 dst += 16;
1527                                 src += 4*sizeof(float[3]);
1528                         }
1529                 }
1530         }
1531         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1532         {
1533                 while (dst < end)
1534                 {
1535                         __m128 v = _mm_loadu_ps((const float *)src);
1536                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1537                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1538                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1539                         _mm_store_ps(dst, v);
1540                         dst += 4;
1541                         src += stride;
1542                 }
1543         }
1544         else
1545         {
1546                 while (dst < end)
1547                 {
1548                         __m128 v = _mm_load_ps((const float *)src);
1549                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552                         _mm_store_ps(dst, v);
1553                         dst += 4;
1554                         src += stride;
1555                 }
1556         }
1557 }
1558
1559 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1560 {
1561         float *end = dst + size*4;
1562         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1563         if (stride == sizeof(float[2]))
1564         {
1565                 float *end2 = dst + (size&~1)*4;
1566                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1567                 {
1568                         while (dst < end2)
1569                         {
1570                                 __m128 v = _mm_loadu_ps((const float *)src);
1571                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1572                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1573                                 dst += 8;
1574                                 src += 2*sizeof(float[2]);
1575                         }
1576                 }
1577                 else
1578                 {
1579                         while (dst < end2)
1580                         {
1581                                 __m128 v = _mm_load_ps((const float *)src);
1582                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1583                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1584                                 dst += 8;
1585                                 src += 2*sizeof(float[2]);
1586                         }
1587                 }
1588         }
1589         while (dst < end)
1590         {
1591                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1592                 dst += 4;
1593                 src += stride;
1594         }
1595 }
1596
1597 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1598 {
1599         float *end = dst + size*4;
1600         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1601         if (stride == sizeof(unsigned char[4]))
1602         {
1603                 float *end4 = dst + (size&~3)*4;
1604                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1605                 {
1606                         while (dst < end4)
1607                         {
1608                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1609                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1610                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1611                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1612                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1613                                 dst += 16;
1614                                 src += 4*sizeof(unsigned char[4]);
1615                         }
1616                 }
1617                 else
1618                 {
1619                         while (dst < end4)
1620                         {
1621                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1626                                 dst += 16;
1627                                 src += 4*sizeof(unsigned char[4]);
1628                         }
1629                 }
1630         }
1631         while (dst < end)
1632         {
1633                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1634                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1635                 dst += 4;
1636                 src += stride;
1637         }
1638 }
1639
1640 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1641 {
1642         float *end = dst + 4*size;
1643         __m128 v = _mm_loadu_ps(src);
1644         while (dst < end)
1645         {
1646                 _mm_store_ps(dst, v);
1647                 dst += 4;
1648         }
1649 }
1650 #endif
1651
// Multiplies `numitems` four-float vectors from in4f by the 16 float matrix inmatrix16f and stores them to out4f:
// out = in.x*m[0..3] + in.y*m[4..7] + in.z*m[8..11] + in.w*m[12..15].
// An identity matrix degenerates to a plain copy (skipped when out4f == in4f).
// out4f is written with aligned stores. Without SSE_POSSIBLE the function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (!(((size_t)in4f)&(ALIGN_SIZE-1)))
	{
		// aligned input
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		// unaligned input
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1699
// Copies `numitems` four-float vectors from in4f to out4f.
// memmove is used so that in-place calls (out4f == in4f) and overlapping ranges are well defined;
// memcpy has undefined behaviour when the two ranges overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	memmove(out4f, in4f, numitems * sizeof(float[4]));
}
1704
1705 #ifdef SSE_POSSIBLE
// Perspective-divides the vertex `in` = (x, y, z, w) and maps it through the viewport.
// viewportcenter/viewportscale are applied with lane 0 paired with the constant 1 and lanes 1..3 with x, y, z:
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// `in` is read before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 dpsr_pv_in = (in); \
	__m128 dpsr_pv_w = _mm_shuffle_ps(dpsr_pv_in, dpsr_pv_in, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 dpsr_pv_p = _mm_shuffle_ps(dpsr_pv_in, dpsr_pv_in, _MM_SHUFFLE(2, 1, 0, 3)); \
	dpsr_pv_p = _mm_move_ss(dpsr_pv_p, _mm_set_ss(1.0f)); \
	dpsr_pv_p = _mm_mul_ps(viewportscale, dpsr_pv_p); \
	dpsr_pv_p = _mm_div_ps(dpsr_pv_p, dpsr_pv_w); \
	dpsr_pv_p = _mm_add_ps(viewportcenter, dpsr_pv_p); \
	out = _mm_shuffle_ps(dpsr_pv_p, dpsr_pv_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1713
// Same computation as DPSOFTRAST_PROJECTVERTEX: all four lanes are projected, not only y.
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w) for in = (x, y, z, w)
// `in` is read before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 dpsr_py_in = (in); \
	__m128 dpsr_py_w = _mm_shuffle_ps(dpsr_py_in, dpsr_py_in, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 dpsr_py_p = _mm_shuffle_ps(dpsr_py_in, dpsr_py_in, _MM_SHUFFLE(2, 1, 0, 3)); \
	dpsr_py_p = _mm_move_ss(dpsr_py_p, _mm_set_ss(1.0f)); \
	dpsr_py_p = _mm_mul_ps(viewportscale, dpsr_py_p); \
	dpsr_py_p = _mm_div_ps(dpsr_py_p, dpsr_py_w); \
	dpsr_py_p = _mm_add_ps(viewportcenter, dpsr_py_p); \
	out = _mm_shuffle_ps(dpsr_py_p, dpsr_py_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1721
// Transforms the vector `in` = (x, y, z, w) by the four matrix rows m0..m3:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `in` is read before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 dpsr_tv_in = (in); \
	__m128 dpsr_tv_x = _mm_mul_ps(_mm_shuffle_ps(dpsr_tv_in, dpsr_tv_in, _MM_SHUFFLE(0, 0, 0, 0)), m0); \
	__m128 dpsr_tv_y = _mm_mul_ps(_mm_shuffle_ps(dpsr_tv_in, dpsr_tv_in, _MM_SHUFFLE(1, 1, 1, 1)), m1); \
	__m128 dpsr_tv_z = _mm_mul_ps(_mm_shuffle_ps(dpsr_tv_in, dpsr_tv_in, _MM_SHUFFLE(2, 2, 2, 2)), m2); \
	__m128 dpsr_tv_w = _mm_mul_ps(_mm_shuffle_ps(dpsr_tv_in, dpsr_tv_in, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	out = _mm_add_ps(dpsr_tv_x, _mm_add_ps(dpsr_tv_y, _mm_add_ps(dpsr_tv_z, dpsr_tv_w))); \
}
1730
// Computes a screen row range for the bounding box [minposf, maxposf] transformed by inmatrix16f.
//   starty, endy - receive the truncated screen rows derived from the largest / smallest projected value
//                  (endy gets +1 added after truncation)
//   minposf, maxposf - box corners as 4 floats each; read with aligned loads, so they must be 16 byte aligned
//   inmatrix16f  - 16 float matrix, read with unaligned loads
// Returns a bitmask with bit k set for every box corner k whose transformed z + w failed the ">= 0" test.
// The macros BBFRONT and BBCLIP defined inside are not #undef'ed and stay visible for the rest of the file.
1731 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1732 {
1733         int clipmask = 0xFF;
1734         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1735         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1736         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1737         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
             // swap lanes 0 and 1 of every matrix row so that the second output component (y) of a
             // transformed corner lands in lane 0, where the scalar (_ss) operations below work
1738         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1739         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1740         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1741         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
             // BBFRONT(k, pos): transform corner k into bb[k] and store z + w in lane 0 of clipdist[k].
             // If that sum is >= 0 the corner's bit is cleared from clipmask and its lane 0 value
             // divided by w (y / w) is merged into minproj / maxproj.
1742         #define BBFRONT(k, pos) \
1743         { \
1744                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1745                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1746                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1747                 { \
1748                         __m128 proj; \
1749                         clipmask &= ~(1<<k); \
1750                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1751                         minproj = _mm_min_ss(minproj, proj); \
1752                         maxproj = _mm_max_ss(maxproj, proj); \
1753                 } \
1754         }
             // corner numbering: bit 0 selects max x, bit 1 max y, bit 2 max z (and max w); a clear bit selects the min value
1755         BBFRONT(0, minpos); 
1756         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1757         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1758         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1759         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1760         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1761         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1762         BBFRONT(7, maxpos);
             // BBCLIP(k): for a corner that failed the test above, visit its three edge neighbours
             // (k^1, k^2, k^4 differ from k along x, y, z). For each neighbour that passed, interpolate
             // along the edge with frac = d_k / (d_k - d_neighbour), i.e. to the point where z + w
             // reaches 0, and merge that point's y / w into minproj / maxproj.
1763         #define BBCLIP(k) \
1764         { \
1765                 if (clipmask&(1<<k)) \
1766                 { \
1767                         if (!(clipmask&(1<<(k^1)))) \
1768                         { \
1769                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1770                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1771                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1772                                 minproj = _mm_min_ss(minproj, proj); \
1773                                 maxproj = _mm_max_ss(maxproj, proj); \
1774                         } \
1775                         if (!(clipmask&(1<<(k^2)))) \
1776                         { \
1777                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1778                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1779                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1780                                 minproj = _mm_min_ss(minproj, proj); \
1781                                 maxproj = _mm_max_ss(maxproj, proj); \
1782                         } \
1783                         if (!(clipmask&(1<<(k^4)))) \
1784                         { \
1785                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1786                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1787                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1788                                 minproj = _mm_min_ss(minproj, proj); \
1789                                 maxproj = _mm_max_ss(maxproj, proj); \
1790                         } \
1791                 } \
1792         }
1793         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
             // bring element [2] of the viewport center / scale (the y entry) into lane 0
1794         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1795         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
             // limit the projected range to [-2, 2] before converting it to screen rows
1796         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1797         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1798         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1799         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
             // NOTE(review): starty comes from maxproj and endy from minproj, which presumably relies on
             // a negative y viewport scale (screen rows growing downwards) - confirm against the viewport setup
1800         *starty = _mm_cvttss_si32(maxproj);
1801         *endy = _mm_cvttss_si32(minproj)+1;
1802         return clipmask;
1803 }
1804         
1805 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1806 {
1807         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1808         float *end = out4f + numitems*4;
1809         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1810         __m128 minpos, maxpos;
1811         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1812         {
1813                 minpos = maxpos = _mm_loadu_ps(in4f);
1814                 while (out4f < end)
1815                 {
1816                         __m128 v = _mm_loadu_ps(in4f);
1817                         minpos = _mm_min_ps(minpos, v);
1818                         maxpos = _mm_max_ps(maxpos, v);
1819                         _mm_store_ps(out4f, v);
1820                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1821                         _mm_store_ps(screen4f, v);
1822                         in4f += 4;
1823                         out4f += 4;
1824                         screen4f += 4;
1825                 }
1826         }
1827         else
1828         {
1829                 minpos = maxpos = _mm_load_ps(in4f);
1830                 while (out4f < end)
1831                 {
1832                         __m128 v = _mm_load_ps(in4f);
1833                         minpos = _mm_min_ps(minpos, v);
1834                         maxpos = _mm_max_ps(maxpos, v);
1835                         _mm_store_ps(out4f, v);
1836                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1837                         _mm_store_ps(screen4f, v);
1838                         in4f += 4;
1839                         out4f += 4;
1840                         screen4f += 4;
1841                 }
1842         }
1843         if (starty && endy) 
1844         {
1845                 ALIGN(float minposf[4]);
1846                 ALIGN(float maxposf[4]);
1847                 _mm_store_ps(minposf, minpos);
1848                 _mm_store_ps(maxposf, maxpos);
1849                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1850         }
1851         return 0;
1852 }
1853
1854 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1855 {
1856         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1857         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1858         float *end;
1859         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1860                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1861         end = out4f + numitems*4;
1862         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1863         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1864         m0 = _mm_loadu_ps(inmatrix16f);
1865         m1 = _mm_loadu_ps(inmatrix16f + 4);
1866         m2 = _mm_loadu_ps(inmatrix16f + 8);
1867         m3 = _mm_loadu_ps(inmatrix16f + 12);
1868         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1869         {
1870                 minpos = maxpos = _mm_loadu_ps(in4f);
1871                 while (out4f < end)
1872                 {
1873                         __m128 v = _mm_loadu_ps(in4f);
1874                         minpos = _mm_min_ps(minpos, v);
1875                         maxpos = _mm_max_ps(maxpos, v);
1876                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1877                         _mm_store_ps(out4f, v);
1878                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1879                         _mm_store_ps(screen4f, v);
1880                         in4f += 4;
1881                         out4f += 4;
1882                         screen4f += 4;
1883                 }
1884         }
1885         else
1886         {
1887                 minpos = maxpos = _mm_load_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_load_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         if (starty && endy) 
1903         {
1904                 ALIGN(float minposf[4]);
1905                 ALIGN(float maxposf[4]);
1906                 _mm_store_ps(minposf, minpos);
1907                 _mm_store_ps(maxposf, maxpos);
1908                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1909         }
1910         return 0;
1911 }
1912 #endif
1913
// Expands the currently bound input array `inarray` for vertices
// [dpsoftrast.firstvertex, firstvertex + dpsoftrast.numvertices) into four-float vectors in
// dpsoftrast.post_array4f[outarray] and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION - three floats per vertex
//   DPSOFTRAST_ARRAY_COLOR    - float colors if bound, else byte colors if bound, else the constant dpsoftrast.color
//   anything else             - treated as texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with 2, 3 or 4
//                               components; the buffer is left untouched if no pointer is bound or the
//                               component count is something else
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1972
1973 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1974 {
1975         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1976         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1977         return data;
1978 }
1979
#if 0
// (compiled out) Projects post array `outarray` to dpsoftrast.screencoord4f without transforming it,
// recording the clip mask and row range in dpsoftrast.drawclipped / drawstarty / drawendy.
// With inarray >= 0 the array is first loaded from that input array.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1992
// Transforms post array `outarray` in place by inmatrix16f, writes the projected vertices to
// dpsoftrast.screencoord4f and records the clip mask and row range in
// dpsoftrast.drawclipped / drawstarty / drawendy. Returns the post array.
// With inarray >= 0 the array is first loaded from that input array.
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2003
2004 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2005 {
2006         int x;
2007         int startx = span->startx;
2008         int endx = span->endx;
2009         float wslope = triangle->w[0];
2010         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2011         float endz = 1.0f / (w + wslope * startx);
2012         if (triangle->w[0] == 0)
2013         {
2014                 // LordHavoc: fast flat polygons (HUD/menu)
2015                 for (x = startx;x < endx;x++)
2016                         zf[x] = endz;
2017                 return;
2018         }
2019         for (x = startx;x < endx;)
2020         {
2021                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2022                 float z = endz, dz;
2023                 if (nextsub >= endx) nextsub = endsub = endx-1;
2024                 endz = 1.0f / (w + wslope * nextsub);
2025                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2026                 for (; x <= endsub; x++, z += dz)
2027                         zf[x] = z;
2028         }
2029 }
2030
2031 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2032 {
2033         int x;
2034         int startx = span->startx;
2035         int endx = span->endx;
2036         int d[4];
2037         float a, b;
2038         unsigned char * RESTRICT pixelmask = span->pixelmask;
2039         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2040         if (!pixel)
2041                 return;
2042         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2043         // handle alphatest now (this affects depth writes too)
2044         if (thread->alphatest)
2045                 for (x = startx;x < endx;x++)
2046                         if (in4f[x*4+3] < 0.5f)
2047                                 pixelmask[x] = false;
2048         // FIXME: this does not handle bigendian
2049         switch(thread->fb_blendmode)
2050         {
2051         case DPSOFTRAST_BLENDMODE_OPAQUE:
2052                 for (x = startx;x < endx;x++)
2053                 {
2054                         if (!pixelmask[x])
2055                                 continue;
2056                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2057                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2058                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2059                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2060                         pixel[x*4+0] = d[0];
2061                         pixel[x*4+1] = d[1];
2062                         pixel[x*4+2] = d[2];
2063                         pixel[x*4+3] = d[3];
2064                 }
2065                 break;
2066         case DPSOFTRAST_BLENDMODE_ALPHA:
2067                 for (x = startx;x < endx;x++)
2068                 {
2069                         if (!pixelmask[x])
2070                                 continue;
2071                         a = in4f[x*4+3] * 255.0f;
2072                         b = 1.0f - in4f[x*4+3];
2073                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2074                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2075                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2076                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2077                         pixel[x*4+0] = d[0];
2078                         pixel[x*4+1] = d[1];
2079                         pixel[x*4+2] = d[2];
2080                         pixel[x*4+3] = d[3];
2081                 }
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2084                 for (x = startx;x < endx;x++)
2085                 {
2086                         if (!pixelmask[x])
2087                                 continue;
2088                         a = in4f[x*4+3] * 255.0f;
2089                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2090                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2091                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2092                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2093                         pixel[x*4+0] = d[0];
2094                         pixel[x*4+1] = d[1];
2095                         pixel[x*4+2] = d[2];
2096                         pixel[x*4+3] = d[3];
2097                 }
2098                 break;
2099         case DPSOFTRAST_BLENDMODE_ADD:
2100                 for (x = startx;x < endx;x++)
2101                 {
2102                         if (!pixelmask[x])
2103                                 continue;
2104                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2105                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2106                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2107                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2108                         pixel[x*4+0] = d[0];
2109                         pixel[x*4+1] = d[1];
2110                         pixel[x*4+2] = d[2];
2111                         pixel[x*4+3] = d[3];
2112                 }
2113                 break;
2114         case DPSOFTRAST_BLENDMODE_INVMOD:
2115                 for (x = startx;x < endx;x++)
2116                 {
2117                         if (!pixelmask[x])
2118                                 continue;
2119                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2120                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2121                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2122                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2123                         pixel[x*4+0] = d[0];
2124                         pixel[x*4+1] = d[1];
2125                         pixel[x*4+2] = d[2];
2126                         pixel[x*4+3] = d[3];
2127                 }
2128                 break;
2129         case DPSOFTRAST_BLENDMODE_MUL:
2130                 for (x = startx;x < endx;x++)
2131                 {
2132                         if (!pixelmask[x])
2133                                 continue;
2134                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2135                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2136                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2137                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2138                         pixel[x*4+0] = d[0];
2139                         pixel[x*4+1] = d[1];
2140                         pixel[x*4+2] = d[2];
2141                         pixel[x*4+3] = d[3];
2142                 }
2143                 break;
2144         case DPSOFTRAST_BLENDMODE_MUL2:
2145                 for (x = startx;x < endx;x++)
2146                 {
2147                         if (!pixelmask[x])
2148                                 continue;
2149                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2150                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2151                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2152                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2153                         pixel[x*4+0] = d[0];
2154                         pixel[x*4+1] = d[1];
2155                         pixel[x*4+2] = d[2];
2156                         pixel[x*4+3] = d[3];
2157                 }
2158                 break;
2159         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2160                 for (x = startx;x < endx;x++)
2161                 {
2162                         if (!pixelmask[x])
2163                                 continue;
2164                         a = in4f[x*4+3] * -255.0f;
2165                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2166                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2167                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2168                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2169                         pixel[x*4+0] = d[0];
2170                         pixel[x*4+1] = d[1];
2171                         pixel[x*4+2] = d[2];
2172                         pixel[x*4+3] = d[3];
2173                 }
2174                 break;
2175         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2176                 for (x = startx;x < endx;x++)
2177                 {
2178                         if (!pixelmask[x])
2179                                 continue;
2180                         a = 255.0f;
2181                         b = 1.0f - in4f[x*4+3];
2182                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2183                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2184                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2185                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2186                         pixel[x*4+0] = d[0];
2187                         pixel[x*4+1] = d[1];
2188                         pixel[x*4+2] = d[2];
2189                         pixel[x*4+3] = d[3];
2190                 }
2191                 break;
2192         case DPSOFTRAST_BLENDMODE_INVADD:
2193                 for (x = startx;x < endx;x++)
2194                 {
2195                         if (!pixelmask[x])
2196                                 continue;
2197                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2198                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2199                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2200                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2201                         pixel[x*4+0] = d[0];
2202                         pixel[x*4+1] = d[1];
2203                         pixel[x*4+2] = d[2];
2204                         pixel[x*4+3] = d[3];
2205                 }
2206                 break;
2207         }
2208 }
2209
2210 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2211 {
2212 #ifdef SSE_POSSIBLE
2213         int x;
2214         int startx = span->startx;
2215         int endx = span->endx;
2216         int subx;
2217         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2218         unsigned char * RESTRICT pixelmask = span->pixelmask;
2219         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2220         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2221         if (!pixel)
2222                 return;
2223         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2224         pixeli += span->y * dpsoftrast.fb_width + span->x;
2225         // handle alphatest now (this affects depth writes too)
2226         if (thread->alphatest)
2227                 for (x = startx;x < endx;x++)
2228                         if (in4ub[x*4+3] < 128)
2229                                 pixelmask[x] = false;
2230         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2231         // helps sprites, text and hud artwork
2232         switch(thread->fb_blendmode)
2233         {
2234         case DPSOFTRAST_BLENDMODE_ALPHA:
2235         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2237                 for (x = startx;x < endx;x++)
2238                         if (in4ub[x*4+3] < 1)
2239                                 pixelmask[x] = false;
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_OPAQUE:
2242         case DPSOFTRAST_BLENDMODE_ADD:
2243         case DPSOFTRAST_BLENDMODE_INVMOD:
2244         case DPSOFTRAST_BLENDMODE_MUL:
2245         case DPSOFTRAST_BLENDMODE_MUL2:
2246         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2247         case DPSOFTRAST_BLENDMODE_INVADD:
2248                 break;
2249         }
2250         // put some special values at the end of the mask to ensure the loops end
2251         pixelmask[endx] = 1;
2252         pixelmask[endx+1] = 0;
2253         // LordHavoc: use a double loop to identify subspans, this helps the
2254         // optimized copy/blend loops to perform at their best, most triangles
2255         // have only one run of pixels, and do the search using wide reads...
2256         x = startx;
2257         while (x < endx)
2258         {
2259                 // if this pixel is masked off, it's probably not alone...
2260                 if (!pixelmask[x])
2261                 {
2262                         x++;
2263 #if 1
2264                         if (x + 8 < endx)
2265                         {
2266                                 // the 4-item search must be aligned or else it stalls badly
2267                                 if ((x & 3) && !pixelmask[x]) x++;
2268                                 if ((x & 3) && !pixelmask[x]) x++;
2269                                 if ((x & 3) && !pixelmask[x]) x++;
2270                                 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2271                                         x += 4;
2272                         }
2273 #endif
2274                         for (;!pixelmask[x];x++)
2275                                 ;
2276                         // rather than continue the loop, just check the end variable
2277                         if (x >= endx)
2278                                 break;
2279                 }
2280                 // find length of subspan
2281                 subx = x + 1;
2282 #if 1
2283                 if (x + 8 < endx)
2284                 {
2285                         if ((subx & 3) && pixelmask[subx]) subx++;
2286                         if ((subx & 3) && pixelmask[subx]) subx++;
2287                         if ((subx & 3) && pixelmask[subx]) subx++;
2288                         while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2289                                 subx += 4;
2290                 }
2291 #endif
2292                 for (;pixelmask[subx];subx++)
2293                         ;
2294                 // the checks can overshoot, so make sure to clip it...
2295                 if (subx > endx)
2296                         subx = endx;
2297                 // now that we know the subspan length...  process!
2298                 switch(thread->fb_blendmode)
2299                 {
2300                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2301 #if 0
2302                         if (subx - x >= 16)
2303                         {
2304                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2305                                 x = subx;
2306                         }
2307                         else
2308 #elif 1
2309                         while (x + 16 <= subx)
2310                         {
2311                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2312                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2313                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2314                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2315                                 x += 16;
2316                         }
2317 #endif
2318                         {
2319                                 while (x + 4 <= subx)
2320                                 {
2321                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2322                                         x += 4;
2323                                 }
2324                                 if (x + 2 <= subx)
2325                                 {
2326                                         pixeli[x] = ini[x];
2327                                         pixeli[x+1] = ini[x+1];
2328                                         x += 2;
2329                                 }
2330                                 if (x < subx)
2331                                 {
2332                                         pixeli[x] = ini[x];
2333                                         x++;
2334                                 }
2335                         }
2336                         break;
2337                 case DPSOFTRAST_BLENDMODE_ALPHA:
2338                 #define FINISHBLEND(blend2, blend1) \
2339                         for (;x + 1 < subx;x += 2) \
2340                         { \
2341                                 __m128i src, dst; \
2342                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2343                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2344                                 blend2; \
2345                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2346                         } \
2347                         if (x < subx) \
2348                         { \
2349                                 __m128i src, dst; \
2350                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2351                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2352                                 blend1; \
2353                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2354                                 x++; \
2355                         }
2356                         FINISHBLEND({
2357                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2358                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2359                         }, {
2360                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2361                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2362                         });
2363                         break;
2364                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2365                         FINISHBLEND({
2366                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2367                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2368                         }, {
2369                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2370                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2371                         });
2372                         break;
2373                 case DPSOFTRAST_BLENDMODE_ADD:
2374                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2375                         break;
2376                 case DPSOFTRAST_BLENDMODE_INVMOD:
2377                         FINISHBLEND({
2378                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2379                         }, {
2380                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2381                         });
2382                         break;
2383                 case DPSOFTRAST_BLENDMODE_MUL:
2384                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2385                         break;
2386                 case DPSOFTRAST_BLENDMODE_MUL2:
2387                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2388                         break;
2389                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2390                         FINISHBLEND({
2391                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2392                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2393                         }, {
2394                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2395                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2396                         });
2397                         break;
2398                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2399                         FINISHBLEND({
2400                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2401                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2402                         }, {
2403                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2404                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2405                         });
2406                         break;
2407                 case DPSOFTRAST_BLENDMODE_INVADD:
2408                         FINISHBLEND({
2409                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2410                         }, {
2411                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2412                         });
2413                         break;
2414                 }
2415         }
2416 #endif
2417 }
2418
2419 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2420 {
2421         int x;
2422         int startx = span->startx;
2423         int endx = span->endx;
2424         int flags;
2425         float c[4];
2426         float data[4];
2427         float slope[4];
2428         float tc[2], endtc[2];
2429         float tcscale[2];
2430         unsigned int tci[2];
2431         unsigned int tci1[2];
2432         unsigned int tcimin[2];
2433         unsigned int tcimax[2];
2434         int tciwrapmask[2];
2435         int tciwidth;
2436         int filter;
2437         int mip;
2438         const unsigned char * RESTRICT pixelbase;
2439         const unsigned char * RESTRICT pixel[4];
2440         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2441         // if no texture is bound, just fill it with white
2442         if (!texture)
2443         {
2444                 for (x = startx;x < endx;x++)
2445                 {
2446                         out4f[x*4+0] = 1.0f;
2447                         out4f[x*4+1] = 1.0f;
2448                         out4f[x*4+2] = 1.0f;
2449                         out4f[x*4+3] = 1.0f;
2450                 }
2451                 return;
2452         }
2453         mip = triangle->mip[texunitindex];
2454         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2455         // if this mipmap of the texture is 1 pixel, just fill it with that color
2456         if (texture->mipmap[mip][1] == 4)
2457         {
2458                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2459                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2460                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2461                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2462                 for (x = startx;x < endx;x++)
2463                 {
2464                         out4f[x*4+0] = c[0];
2465                         out4f[x*4+1] = c[1];
2466                         out4f[x*4+2] = c[2];
2467                         out4f[x*4+3] = c[3];
2468                 }
2469                 return;
2470         }
2471         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2473         flags = texture->flags;
2474         tcscale[0] = texture->mipmap[mip][2];
2475         tcscale[1] = texture->mipmap[mip][3];
2476         tciwidth = texture->mipmap[mip][2];
2477         tcimin[0] = 0;
2478         tcimin[1] = 0;
2479         tcimax[0] = texture->mipmap[mip][2]-1;
2480         tcimax[1] = texture->mipmap[mip][3]-1;
2481         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2482         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2483         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2484         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2485         if (filter)
2486         {
2487                 endtc[0] -= 0.5f;
2488                 endtc[1] -= 0.5f;
2489         }
2490         for (x = startx;x < endx;)
2491         {
2492                 unsigned int subtc[2];
2493                 unsigned int substep[2];
2494                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2495                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2496                 if (nextsub >= endx)
2497                 {
2498                         nextsub = endsub = endx-1;      
2499                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2500                 }
2501                 tc[0] = endtc[0];
2502                 tc[1] = endtc[1];
2503                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2504                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2505                 if (filter)
2506                 {
2507                         endtc[0] -= 0.5f;
2508                         endtc[1] -= 0.5f;
2509                 }
2510                 substep[0] = (endtc[0] - tc[0]) * subscale;
2511                 substep[1] = (endtc[1] - tc[1]) * subscale;
2512                 subtc[0] = tc[0] * (1<<12);
2513                 subtc[1] = tc[1] * (1<<12);
2514                 if (filter)
2515                 {
2516                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2517                         {
2518                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2519                                 {
2520                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2521                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2522                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2523                                         tci[0] = subtc[0]>>12;
2524                                         tci[1] = subtc[1]>>12;
2525                                         tci1[0] = tci[0] + 1;
2526                                         tci1[1] = tci[1] + 1;
2527                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2528                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2529                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2530                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2531                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2532                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2533                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2534                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2535                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2536                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2537                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2538                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2539                                         out4f[x*4+0] = c[0];
2540                                         out4f[x*4+1] = c[1];
2541                                         out4f[x*4+2] = c[2];
2542                                         out4f[x*4+3] = c[3];
2543                                 }
2544                         }
2545                         else
2546                         {
2547                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2548                                 {
2549                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2550                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2551                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2552                                         tci[0] = subtc[0]>>12;
2553                                         tci[1] = subtc[1]>>12;
2554                                         tci1[0] = tci[0] + 1;
2555                                         tci1[1] = tci[1] + 1;
2556                                         tci[0] &= tciwrapmask[0];
2557                                         tci[1] &= tciwrapmask[1];
2558                                         tci1[0] &= tciwrapmask[0];
2559                                         tci1[1] &= tciwrapmask[1];
2560                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2561                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2562                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2563                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2564                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2565                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2566                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2567                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2568                                         out4f[x*4+0] = c[0];
2569                                         out4f[x*4+1] = c[1];
2570                                         out4f[x*4+2] = c[2];
2571                                         out4f[x*4+3] = c[3];
2572                                 }
2573                         }
2574                 }
2575                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2576                 {
2577                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2578                         {
2579                                 tci[0] = subtc[0]>>12;
2580                                 tci[1] = subtc[1]>>12;
2581                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2582                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2583                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2584                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2585                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2586                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2587                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2588                                 out4f[x*4+0] = c[0];
2589                                 out4f[x*4+1] = c[1];
2590                                 out4f[x*4+2] = c[2];
2591                                 out4f[x*4+3] = c[3];
2592                         }
2593                 }
2594                 else
2595                 {
2596                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2597                         {
2598                                 tci[0] = subtc[0]>>12;
2599                                 tci[1] = subtc[1]>>12;
2600                                 tci[0] &= tciwrapmask[0];
2601                                 tci[1] &= tciwrapmask[1];
2602                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2603                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2604                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2605                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2606                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2607                                 out4f[x*4+0] = c[0];
2608                                 out4f[x*4+1] = c[1];
2609                                 out4f[x*4+2] = c[2];
2610                                 out4f[x*4+3] = c[3];
2611                         }
2612                 }
2613         }
2614 }
2615
2616 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2617 {
2618 #ifdef SSE_POSSIBLE
2619         int x;
2620         int startx = span->startx;
2621         int endx = span->endx;
2622         int flags;
2623         __m128 data, slope, tcscale;
2624         __m128i tcsize, tcmask, tcoffset, tcmax;
2625         __m128 tc, endtc;
2626         __m128i subtc, substep, endsubtc;
2627         int filter;
2628         int mip;
2629         int affine; // LordHavoc: optimized affine texturing case
2630         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2631         const unsigned char * RESTRICT pixelbase;
2632         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2633         // if no texture is bound, just fill it with white
2634         if (!texture)
2635         {
2636                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2637                 return;
2638         }
2639         mip = triangle->mip[texunitindex];
2640         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2641         // if this mipmap of the texture is 1 pixel, just fill it with that color
2642         if (texture->mipmap[mip][1] == 4)
2643         {
2644                 unsigned int k = *((const unsigned int *)pixelbase);
2645                 for (x = startx;x < endx;x++)
2646                         outi[x] = k;
2647                 return;
2648         }
2649         affine = zf[startx] == zf[endx-1];
2650         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2651         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2652         flags = texture->flags;
2653         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2654         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2655         tcscale = _mm_cvtepi32_ps(tcsize);
2656         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2657         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2658         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2659         if (filter)
2660                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2661         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2662         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2663         tcmax = _mm_packs_epi32(tcmask, tcmask);
2664         for (x = startx;x < endx;)
2665         {
2666                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2667                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2668                 if (nextsub >= endx || affine)
2669                 {
2670                         nextsub = endsub = endx-1;
2671                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2672                 }       
2673                 tc = endtc;
2674                 subtc = endsubtc;
2675                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2676                 if (filter)
2677                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2678                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2679                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2680                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2681                 substep = _mm_slli_epi32(substep, 1);
2682                 if (filter)
2683                 {
2684                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2685                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2686                         {
2687                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2688                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2689                                 {
2690                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2691                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2692                                         tci = _mm_madd_epi16(tci, tcoffset);
2693                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2694                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2695                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2696                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2697                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2698                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2699                                         fracm = _mm_srli_epi16(subtc, 1);
2700                                         pix1 = _mm_add_epi16(pix1,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703                                         pix3 = _mm_add_epi16(pix3,
2704                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2705                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2706                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2707                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2708                                         pix2 = _mm_add_epi16(pix2,
2709                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2710                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2711                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2712                                 }
2713                                 if (x <= endsub)
2714                                 {
2715                                         const unsigned char * RESTRICT ptr1;
2716                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2717                                         tci = _mm_madd_epi16(tci, tcoffset);
2718                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2719                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2720                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2721                                         fracm = _mm_srli_epi16(subtc, 1);
2722                                         pix1 = _mm_add_epi16(pix1,
2723                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2724                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2725                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2726                                         pix1 = _mm_add_epi16(pix1,
2727                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2729                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2730                                         x++;
2731                                 }
2732                         }
2733                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2734                         {
2735                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2736                                 {
2737                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2738                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745                                                                                         _mm_setzero_si128());
2746                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2747                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751                                                                                         _mm_setzero_si128());
2752                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754                                                                                         _mm_setzero_si128());
2755                                         fracm = _mm_srli_epi16(subtc, 1);
2756                                         pix1 = _mm_add_epi16(pix1,
2757                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2758                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2759                                         pix3 = _mm_add_epi16(pix3,
2760                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2761                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2762                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2763                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2764                                         pix2 = _mm_add_epi16(pix2,
2765                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2766                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2767                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2768                                 }
2769                                 if (x <= endsub)
2770                                 {
2771                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2772                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2773                                         tci = _mm_madd_epi16(tci, tcoffset);
2774                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2775                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2776                                                                                         _mm_setzero_si128());
2777                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2778                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2779                                                                                         _mm_setzero_si128());
2780                                         fracm = _mm_srli_epi16(subtc, 1);
2781                                         pix1 = _mm_add_epi16(pix1,
2782                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2783                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2784                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2785                                         pix1 = _mm_add_epi16(pix1,
2786                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2787                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2788                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2789                                         x++;
2790                                 }
2791                         }
2792                         else
2793                         {
2794                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2795                                 {
2796                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2797                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2800                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2801                                                                                         _mm_setzero_si128());
2802                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2803                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2804                                                                                         _mm_setzero_si128());
2805                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2806                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2807                                         tci = _mm_madd_epi16(tci, tcoffset);
2808                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2809                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2810                                                                                         _mm_setzero_si128());
2811                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2812                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2813                                                                                         _mm_setzero_si128());
2814                                         fracm = _mm_srli_epi16(subtc, 1);
2815                                         pix1 = _mm_add_epi16(pix1,
2816                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2817                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2818                                         pix3 = _mm_add_epi16(pix3,
2819                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2820                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2821                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2822                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2823                                         pix2 = _mm_add_epi16(pix2,
2824                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2825                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2826                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2827                                 }
2828                                 if (x <= endsub)
2829                                 {
2830                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2831                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2832                                         tci = _mm_madd_epi16(tci, tcoffset);
2833                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2834                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2835                                                                                         _mm_setzero_si128());
2836                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2837                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2838                                                                                         _mm_setzero_si128());
2839                                         fracm = _mm_srli_epi16(subtc, 1);
2840                                         pix1 = _mm_add_epi16(pix1,
2841                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2842                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2843                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2844                                         pix1 = _mm_add_epi16(pix1,
2845                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2846                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2847                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2848                                         x++;
2849                                 }
2850                         }
2851                 }
2852                 else
2853                 {
2854                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2855                         {
2856                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2857                                 {
2858                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2859                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2860                                         tci = _mm_madd_epi16(tci, tcoffset);
2861                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2862                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2863                                 }
2864                                 if (x <= endsub)
2865                                 {
2866                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2867                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2868                                         tci = _mm_madd_epi16(tci, tcoffset);
2869                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2870                                         x++;
2871                                 }
2872                         }
2873                         else
2874                         {
2875                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2876                                 {
2877                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2878                                         tci = _mm_and_si128(tci, tcmax); 
2879                                         tci = _mm_madd_epi16(tci, tcoffset);
2880                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2881                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2882                                 }
2883                                 if (x <= endsub)
2884                                 {
2885                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2886                                         tci = _mm_and_si128(tci, tcmax); 
2887                                         tci = _mm_madd_epi16(tci, tcoffset);
2888                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2889                                         x++;
2890                                 }
2891                         }
2892                 }
2893         }
2894 #endif
2895 }
2896
2897 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2898 {
2899         // TODO: IMPLEMENT
2900         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2901 }
2902
// Shadowmap lookup placeholder: the argument is ignored and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2908
2909 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2910 {
2911         int x;
2912         int startx = span->startx;
2913         int endx = span->endx;
2914         float c[4];
2915         float data[4];
2916         float slope[4];
2917         float z;
2918         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2919         for (x = startx;x < endx;x++)
2920         {
2921                 z = zf[x];
2922                 c[0] = (data[0] + slope[0]*x) * z;
2923                 c[1] = (data[1] + slope[1]*x) * z;
2924                 c[2] = (data[2] + slope[2]*x) * z;
2925                 c[3] = (data[3] + slope[3]*x) * z;
2926                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2927                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2928                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2929                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2930         }
2931 }
2932
2933 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2934 {
2935         int x;
2936         int startx = span->startx;
2937         int endx = span->endx;
2938         float c[4];
2939         float data[4];
2940         float slope[4];
2941         float z;
2942         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2943         for (x = startx;x < endx;x++)
2944         {
2945                 z = zf[x];
2946                 c[0] = (data[0] + slope[0]*x) * z;
2947                 c[1] = (data[1] + slope[1]*x) * z;
2948                 c[2] = (data[2] + slope[2]*x) * z;
2949                 c[3] = (data[3] + slope[3]*x) * z;
2950                 out4f[x*4+0] = c[0];
2951                 out4f[x*4+1] = c[1];
2952                 out4f[x*4+2] = c[2];
2953                 out4f[x*4+3] = c[3];
2954         }
2955 }
2956
2957 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2958 {
2959         int x, startx = span->startx, endx = span->endx;
2960         float c[4], localcolor[4];
2961         localcolor[0] = subcolor[0];
2962         localcolor[1] = subcolor[1];
2963         localcolor[2] = subcolor[2];
2964         localcolor[3] = subcolor[3];
2965         for (x = startx;x < endx;x++)
2966         {
2967                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2968                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2969                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2970                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2971                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2972                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2973                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2974                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2975         }
2976 }
2977
2978 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2979 {
2980         int x, startx = span->startx, endx = span->endx;
2981         for (x = startx;x < endx;x++)
2982         {
2983                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2984                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2985                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2986                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2987         }
2988 }
2989
2990 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2991 {
2992         int x, startx = span->startx, endx = span->endx;
2993         for (x = startx;x < endx;x++)
2994         {
2995                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2996                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2997                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2998                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2999         }
3000 }
3001
3002 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3003 {
3004         int x, startx = span->startx, endx = span->endx;
3005         float a, b;
3006         for (x = startx;x < endx;x++)
3007         {
3008                 a = 1.0f - inb4f[x*4+3];
3009                 b = inb4f[x*4+3];
3010                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3011                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3012                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3013                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
3014         }
3015 }
3016
3017 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3018 {
3019         int x, startx = span->startx, endx = span->endx;
3020         float localcolor[4], ilerp, lerp;
3021         localcolor[0] = color[0];
3022         localcolor[1] = color[1];
3023         localcolor[2] = color[2];
3024         localcolor[3] = color[3];
3025         ilerp = 1.0f - localcolor[3];
3026         lerp = localcolor[3];
3027         for (x = startx;x < endx;x++)
3028         {
3029                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3030                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3031                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3032                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
3033         }
3034 }
3035
3036
3037
3038 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3039 {
3040 #ifdef SSE_POSSIBLE
3041         int x;
3042         int startx = span->startx;
3043         int endx = span->endx;
3044         __m128 data, slope;
3045         __m128 mod, endmod;
3046         __m128i submod, substep, endsubmod;
3047         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3048         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3049         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3050         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3051         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3052         for (x = startx; x < endx;)
3053         {
3054                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3055                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3056                 if (nextsub >= endx)
3057                 {
3058                         nextsub = endsub = endx-1;
3059                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3060                 }
3061                 mod = endmod;
3062                 submod = endsubmod;
3063                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3064                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3065                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3066                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3067                 substep = _mm_packs_epi32(substep, substep);
3068                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3069                 {
3070                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3071                         pix = _mm_mulhi_epu16(pix, submod);
3072                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3073                 }
3074                 if (x <= endsub)
3075                 {
3076                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3077                         pix = _mm_mulhi_epu16(pix, submod);
3078                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3079                         x++;
3080                 }
3081         }
3082 #endif
3083 }
3084
3085 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3086 {
3087 #ifdef SSE_POSSIBLE
3088         int x;
3089         int startx = span->startx;
3090         int endx = span->endx;
3091         __m128 data, slope;
3092         __m128 mod, endmod;
3093         __m128i submod, substep, endsubmod;
3094         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3095         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3096         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3097         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3098         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3099         for (x = startx; x < endx;)
3100         {
3101                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3102                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3103                 if (nextsub >= endx)
3104                 {
3105                         nextsub = endsub = endx-1;
3106                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3107                 }
3108                 mod = endmod;
3109                 submod = endsubmod;
3110                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3111                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3112                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3113                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3114                 substep = _mm_packs_epi32(substep, substep);
3115                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3116                 {
3117                         __m128i pix = _mm_srai_epi16(submod, 4);
3118                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3119                 }
3120                 if (x <= endsub)
3121                 {
3122                         __m128i pix = _mm_srai_epi16(submod, 4);
3123                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3124                         x++;
3125                 }
3126         }
3127 #endif
3128 }
3129
3130 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3131 {
3132 #ifdef SSE_POSSIBLE
3133         int x, startx = span->startx, endx = span->endx;
3134         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3135         localcolor = _mm_packs_epi32(localcolor, localcolor);
3136         for (x = startx;x+2 <= endx;x+=2)
3137         {
3138                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3139                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3140                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3141                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3142         }
3143         if (x < endx)
3144         {
3145                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3146                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3147                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3148                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3149         }
3150 #endif
3151 }
3152
3153 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3154 {
3155 #ifdef SSE_POSSIBLE
3156         int x, startx = span->startx, endx = span->endx;
3157         for (x = startx;x+2 <= endx;x+=2)
3158         {
3159                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3160                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3161                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3162                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3163         }
3164         if (x < endx)
3165         {
3166                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3167                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3168                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3169                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3170         }
3171 #endif
3172 }
3173
3174 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3175 {
3176 #ifdef SSE_POSSIBLE
3177         int x, startx = span->startx, endx = span->endx;
3178         for (x = startx;x+2 <= endx;x+=2)
3179         {
3180                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3181                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3182                 pix1 = _mm_add_epi16(pix1, pix2);
3183                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3184         }
3185         if (x < endx)
3186         {
3187                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3188                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3189                 pix1 = _mm_add_epi16(pix1, pix2);
3190                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3191         }
3192 #endif
3193 }
3194
3195 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3196 {
3197 #ifdef SSE_POSSIBLE
3198         int x, startx = span->startx, endx = span->endx;
3199         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3200         tint = _mm_packs_epi32(tint, tint);
3201         for (x = startx;x+2 <= endx;x+=2)
3202         {
3203                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3204                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3205                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3206                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3207         }
3208         if (x < endx)
3209         {
3210                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3211                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3212                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3213                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3214         }
3215 #endif
3216 }
3217
3218 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3219 {
3220 #ifdef SSE_POSSIBLE
3221         int x, startx = span->startx, endx = span->endx;
3222         for (x = startx;x+2 <= endx;x+=2)
3223         {
3224                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3225                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3226                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3227                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3228                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3229         }
3230         if (x < endx)
3231         {
3232                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3233                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3234                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3235                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3236                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3237         }
3238 #endif
3239 }
3240
3241 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3242 {
3243 #ifdef SSE_POSSIBLE
3244         int x, startx = span->startx, endx = span->endx;
3245         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3246         localcolor = _mm_packs_epi32(localcolor, localcolor);
3247         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3248         for (x = startx;x+2 <= endx;x+=2)
3249         {
3250                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3251                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3252                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3253         }
3254         if (x < endx)
3255         {
3256                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3257                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3258                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3259         }
3260 #endif
3261 }
3262
3263
3264
3265 void DPSOFTRAST_VertexShader_Generic(void)
3266 {
3267         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3268         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3269         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3270         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3271                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3272 }
3273
3274 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3275 {
3276         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3277         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3278         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3279         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3280         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3281         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3282         {
3283                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3284                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3285                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3286                 {
3287                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3288                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3289                         {
3290                                 // multiply
3291                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3292                         }
3293                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3294                         {
3295                                 // add
3296                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3297                         }
3298                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3299                         {
3300                                 // alphablend
3301                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3302                         }
3303                 }
3304         }
3305         else
3306                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3307         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3308 }
3309
3310
3311
3312 void DPSOFTRAST_VertexShader_PostProcess(void)
3313 {
3314         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3315         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3316         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3317 }
3318
3319 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3320 {
3321         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3322         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3323         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3324         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3325         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3326         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3327         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3328         {
3329                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3330                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3331         }
3332         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3333         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3334         {
3335                 // TODO: implement saturation
3336         }
3337         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3338         {
3339                 // TODO: implement gammaramps
3340         }
3341         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3342 }
3343
3344
3345
3346 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3347 {
3348         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3349 }
3350
3351 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3352 {
3353         // this is never called (because colormask is off when this shader is used)
3354         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3357         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3358         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3359 }
3360
3361
3362
3363 void DPSOFTRAST_VertexShader_FlatColor(void)
3364 {
3365         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3366         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3367 }
3368
3369 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3370 {
3371 #ifdef SSE_POSSIBLE
3372         unsigned char * RESTRICT pixelmask = span->pixelmask;
3373         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3374         int x, startx = span->startx, endx = span->endx;
3375         __m128i Color_Ambientm;
3376         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3377         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3380         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3381         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3382                 pixel = buffer_FragColorbgra8;
3383         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3384         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3385         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3386         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3387         for (x = startx;x < endx;x++)
3388         {
3389                 __m128i color, pix;
3390                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3391                 {
3392                         __m128i pix2;
3393                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3394                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3395                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3396                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3397                         x += 3;
3398                         continue;
3399                 }
3400                 if (!pixelmask[x])
3401                         continue;
3402                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3403                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3404                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3405         }
3406         if (pixel == buffer_FragColorbgra8)
3407                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3408 #endif
3409 }
3410
3411
3412
3413 void DPSOFTRAST_VertexShader_VertexColor(void)
3414 {
3415         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3416         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3417         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3418 }
3419
3420 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3421 {
3422 #ifdef SSE_POSSIBLE
3423         unsigned char * RESTRICT pixelmask = span->pixelmask;
3424         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3425         int x, startx = span->startx, endx = span->endx;
3426         __m128i Color_Ambientm, Color_Diffusem;
3427         __m128 data, slope;
3428         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3429         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3430         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3432         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3433         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3434         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3435                 pixel = buffer_FragColorbgra8;
3436         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3437         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3438         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3439         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3440         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3441         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3442         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3443         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3444         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3445         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3446         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3447         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3448         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3449         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3450         {
3451                 __m128i color, mod, pix;
3452                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3453                 {
3454                         __m128i pix2, mod2;
3455                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3456                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3457                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3458                         data = _mm_add_ps(data, slope);
3459                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3460                         data = _mm_add_ps(data, slope);
3461                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3462                         data = _mm_add_ps(data, slope);
3463                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3464                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3465                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3466                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3467                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3468                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3469                         x += 3;
3470                         continue;
3471                 }
3472                 if (!pixelmask[x])
3473                         continue;
3474                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3475                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3476                 mod = _mm_packs_epi32(mod, mod);
3477                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3478                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3479         }
3480         if (pixel == buffer_FragColorbgra8)
3481                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3482 #endif
3483 }
3484
3485
3486
3487 void DPSOFTRAST_VertexShader_Lightmap(void)
3488 {
3489         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3490         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3491         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3492 }
3493
3494 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3495 {
3496 #ifdef SSE_POSSIBLE
3497         unsigned char * RESTRICT pixelmask = span->pixelmask;
3498         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3499         int x, startx = span->startx, endx = span->endx;
3500         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3501         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3502         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3503         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3504         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3505         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3506         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3507         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3508         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3509         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3510                 pixel = buffer_FragColorbgra8;
3511         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3512         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3513         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3514         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3515         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3516         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3517         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3518         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3519         {
3520                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3521                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3522                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3523                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3524                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3525                 for (x = startx;x < endx;x++)
3526                 {
3527                         __m128i color, lightmap, glow, pix;
3528                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3529                         {
3530                                 __m128i pix2;
3531                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3532                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3533                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3534                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3535                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3536                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3537                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3538                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3539                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3540                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3541                                 x += 3;
3542                                 continue;
3543                         }
3544                         if (!pixelmask[x])
3545                                 continue;
3546                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3547                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3548                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3549                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3550                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3551                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3552                 }
3553         }
3554         else
3555         {
3556                 for (x = startx;x < endx;x++)
3557                 {
3558                         __m128i color, lightmap, pix;
3559                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3560                         {
3561                                 __m128i pix2;
3562                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3563                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3564                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3565                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3566                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3567                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3568                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3569                                 x += 3;
3570                                 continue;
3571                         }
3572                         if (!pixelmask[x]) 
3573                                 continue;
3574                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3575                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3576                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3577                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3578                 }
3579         }
3580         if (pixel == buffer_FragColorbgra8)
3581                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3582 #endif
3583 }
3584
3585
3586 void DPSOFTRAST_VertexShader_LightDirection(void);
3587 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3588
// Vertex shader for the fake light mode: identical to the LightDirection
// vertex shader, so it simply forwards to it.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3593
3594 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3595 {
3596         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3597 }
3598
3599
3600
3601 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3602 {
3603         DPSOFTRAST_VertexShader_LightDirection();
3604         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3605 }
3606
3607 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3608 {
3609         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3610 }
3611
3612
3613
3614 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3615 {
3616         DPSOFTRAST_VertexShader_LightDirection();
3617         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3618 }
3619
3620 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3621 {
3622         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3623 }
3624
3625
3626
3627 void DPSOFTRAST_VertexShader_LightDirection(void)
3628 {
3629         int i;
3630         int numvertices = dpsoftrast.numvertices;
3631         float LightDir[4];
3632         float LightVector[4];
3633         float EyePosition[4];
3634         float EyeVectorModelSpace[4];
3635         float EyeVector[4];
3636         float position[4];
3637         float svector[4];
3638         float tvector[4];
3639         float normal[4];
3640         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3641         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3642         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3643         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3644         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3645         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3646         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3647         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3648         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3649         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3650         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3651         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3652         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3653         for (i = 0;i < numvertices;i++)
3654         {
3655                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3656                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3657                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3658                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3659                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3660                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3661                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3662                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3663                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3664                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3665                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3666                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3667                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3668                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3669                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3670                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3671                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3672                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3673                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3674                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3675                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3676                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3677                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3678                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3679                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3680                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3681                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3682                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3683                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3684         }
3685         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3686 }
3687
// Small scalar and 3-component vector helpers used by the LightDirection pixel shader.
// These are macros: every argument may be evaluated more than once, so do not
// pass expressions with side effects (x++, function calls with state, ...).

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length / length of the first three elements of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// scales the first three elements of v in place to unit length;
// a vector of length zero is left untouched.
// v is parenthesized at every use so that any pointer expression (e.g. p + 3)
// is accepted, and the temporary has a prefixed name so it cannot capture a
// caller variable that happens to be called "len".
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len = (float)sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len)\
	{\
		dpsoftrast_normalize_len = 1.0f / dpsoftrast_normalize_len;\
		(v)[0] *= dpsoftrast_normalize_len;\
		(v)[1] *= dpsoftrast_normalize_len;\
		(v)[2] *= dpsoftrast_normalize_len;\
	}\
}\
while(0)
3706
3707 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3708 {
3709         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3710         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3711         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3712         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3713         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3714         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3715         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3716         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3717         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3718         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3719         int x, startx = span->startx, endx = span->endx;
3720         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3721         float LightVectordata[4];
3722         float LightVectorslope[4];
3723         float EyeVectordata[4];
3724         float EyeVectorslope[4];
3725         float VectorSdata[4];
3726         float VectorSslope[4];
3727         float VectorTdata[4];
3728         float VectorTslope[4];
3729         float VectorRdata[4];
3730         float VectorRslope[4];
3731         float z;
3732         float diffusetex[4];
3733         float glosstex[4];
3734         float surfacenormal[4];
3735         float lightnormal[4];
3736         float lightnormal_modelspace[4];
3737         float eyenormal[4];
3738         float specularnormal[4];
3739         float diffuse;
3740         float specular;
3741         float SpecularPower;
3742         int d[4];
3743         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3744         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3745         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3746         Color_Glow[3] = 0.0f;
3747         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3748         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3749         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3750         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3751         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3752         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3753         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3754         Color_Pants[3] = 0.0f;
3755         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3756         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3757         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3758         Color_Shirt[3] = 0.0f;
3759         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3760         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3761         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3762         {
3763                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3764                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3765         }
3766         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3767         {
3768                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3769         }
3770         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3771         {
3772                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3773                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3774                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3775                 Color_Diffuse[3] = 0.0f;
3776                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3777                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3778                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3779                 LightColor[3] = 0.0f;
3780                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3781                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3782                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3783                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3784                 Color_Specular[3] = 0.0f;
3785                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3786                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3787                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3788
3789                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3790                 {
3791                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3792                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3793                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3794                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3795                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3796                 }
3797                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3798                 {
3799                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3800                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3801                 }
3802                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3803                 {
3804                         // nothing of this needed
3805                 }
3806                 else
3807                 {
3808                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3809                 }
3810
3811                 for (x = startx;x < endx;x++)
3812                 {
3813                         z = buffer_z[x];
3814                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3815                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3816                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3817                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3818                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3819                         {
3820                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3821                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3822                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3823                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3824                         }
3825                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3826                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3827                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3828                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3829                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3830                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3831                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3832                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3833
3834                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3835                         {
3836                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3837                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3838                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3839                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3840
3841                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3842                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3843                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3844                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3845
3846                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3847                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3848                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3849                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3850
3851                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3852                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3853                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3854                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3855
3856                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3857                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3858
3859                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3860                                 {
3861                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3862                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3863                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3864                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3865                                 }
3866                         }
3867                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3868                         {
3869                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3870                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3871                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3872                                 {
3873                                         float f = 1.0f / 256.0f;
3874                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3875                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3876                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3877                                 }
3878                         }
3879                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3880                         {
3881                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3882                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3883                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3884                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3885
3886                                 LightColor[0] = 1.0;
3887                                 LightColor[1] = 1.0;
3888                                 LightColor[2] = 1.0;
3889                         }
3890                         else
3891                         {
3892                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3893                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3894                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3895                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3896                         }
3897
3898                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3899
3900                         if(thread->shader_exactspecularmath)
3901                         {
3902                                 // reflect lightnormal at surfacenormal, take the negative of that
3903                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3904                                 float f;
3905                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3906                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3907                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3908                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3909
3910                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3911                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3912                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3913                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3914                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3915
3916                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3917                         }
3918                         else
3919                         {
3920                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3921                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3922                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3923                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3924
3925                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3926                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3927                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3928                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3929
3930                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3931                         }
3932
3933                         specular = pow(specular, SpecularPower * glosstex[3]);
3934                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3935                         {
3936                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3937                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3938                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3939                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3940                         }
3941                         else
3942                         {
3943                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3944                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3945                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3946                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3947                         }
3948
3949                         buffer_FragColorbgra8[x*4+0] = d[0];
3950                         buffer_FragColorbgra8[x*4+1] = d[1];
3951                         buffer_FragColorbgra8[x*4+2] = d[2];
3952                         buffer_FragColorbgra8[x*4+3] = d[3];
3953                 }
3954         }
3955         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3956         {
3957                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3958                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3959                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3960                 Color_Diffuse[3] = 0.0f;
3961                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3962                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3963                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3964                 LightColor[3] = 0.0f;
3965                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3966
3967                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3968                 {
3969                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3970                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3971                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3972                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3973                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3974                 }
3975                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3976                 {
3977                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3978                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3979                 }
3980                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3981                 {
3982                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3983                 }
3984                 else
3985                 {
3986                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3987                 }
3988
3989                 for (x = startx;x < endx;x++)
3990                 {
3991                         z = buffer_z[x];
3992                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3993                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3994                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3995                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3996                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3997                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3998                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3999                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4000
4001                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4002                         {
4003                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4004                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4005                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4006                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4007
4008                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4009                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4010                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4011                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4012
4013                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4014                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4015                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4016                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4017
4018                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4019                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4020                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4021                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4022
4023                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4024                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4025
4026                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4027                                 {
4028                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4029                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4030                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4031                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4032                                 }
4033                         }
4034                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4035                         {
4036                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4037                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4038                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4039                                 {
4040                                         float f = 1.0f / 256.0f;
4041                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4042                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4043                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4044                                 }
4045                         }
4046                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4047                         {
4048                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4049                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4050                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4051                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4052
4053                                 LightColor[0] = 1.0;
4054                                 LightColor[1] = 1.0;
4055                                 LightColor[2] = 1.0;
4056                         }
4057                         else
4058                         {
4059                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4060                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4061                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4062                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4063                         }
4064
4065                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4066                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4067                         {
4068                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4069                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4070                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4071                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4072                         }
4073                         else
4074                         {
4075                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4076                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4077                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4078                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4079                         }
4080                         buffer_FragColorbgra8[x*4+0] = d[0];
4081                         buffer_FragColorbgra8[x*4+1] = d[1];
4082                         buffer_FragColorbgra8[x*4+2] = d[2];
4083                         buffer_FragColorbgra8[x*4+3] = d[3];
4084                 }
4085         }
4086         else
4087         {
4088                 for (x = startx;x < endx;x++)
4089                 {
4090                         z = buffer_z[x];
4091                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4092                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4093                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4094                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4095
4096                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4097                         {
4098                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4099                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4100                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4101                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4102                         }
4103                         else
4104                         {
4105                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4106                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4107                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4108                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4109                         }
4110                         buffer_FragColorbgra8[x*4+0] = d[0];
4111                         buffer_FragColorbgra8[x*4+1] = d[1];
4112                         buffer_FragColorbgra8[x*4+2] = d[2];
4113                         buffer_FragColorbgra8[x*4+3] = d[3];
4114                 }
4115         }
4116         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4117 }
4118
4119
4120
4121 void DPSOFTRAST_VertexShader_LightSource(void)
4122 {
4123         int i;
4124         int numvertices = dpsoftrast.numvertices;
4125         float LightPosition[4];
4126         float LightVector[4];
4127         float LightVectorModelSpace[4];
4128         float EyePosition[4];
4129         float EyeVectorModelSpace[4];
4130         float EyeVector[4];
4131         float position[4];
4132         float svector[4];
4133         float tvector[4];
4134         float normal[4];
4135         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4136         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4137         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4138         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4139         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4140         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4141         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4142         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4143         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4144         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4145         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4146         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4147         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4148         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4149         for (i = 0;i < numvertices;i++)
4150         {
4151                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4152                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4153                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4154                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4155                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4156                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4157                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4158                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4159                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4160                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4161                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4162                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4163                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4164                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4165                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4166                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4167                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4168                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4169                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4170                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4171                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4172                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4173                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4174                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4175                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4176                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4177                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4178                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4179                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4180                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4181                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4182                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4183         }
4184         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4185         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4186 }
4187
4188 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4189 {
4190 #ifdef SSE_POSSIBLE
4191         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4192         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4193         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4194         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4195         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4196         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4197         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4198         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4199         int x, startx = span->startx, endx = span->endx;
4200         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4201         float CubeVectordata[4];
4202         float CubeVectorslope[4];
4203         float LightVectordata[4];
4204         float LightVectorslope[4];
4205         float EyeVectordata[4];
4206         float EyeVectorslope[4];
4207         float z;
4208         float diffusetex[4];
4209         float glosstex[4];
4210         float surfacenormal[4];
4211         float lightnormal[4];
4212         float eyenormal[4];
4213         float specularnormal[4];
4214         float diffuse;
4215         float specular;
4216         float SpecularPower;
4217         float CubeVector[4];
4218         float attenuation;
4219         int d[4];
4220         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4221         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4222         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4223         Color_Glow[3] = 0.0f;
4224         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4225         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4226         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4227         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4228         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4229         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4230         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4231         Color_Diffuse[3] = 0.0f;
4232         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4233         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4234         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4235         Color_Specular[3] = 0.0f;
4236         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4237         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4238         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4239         Color_Pants[3] = 0.0f;
4240         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4241         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4242         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4243         Color_Shirt[3] = 0.0f;
4244         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4245         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4246         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4247         LightColor[3] = 0.0f;
4248         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4249         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4250         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4251         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4252         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4253         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4254         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4255         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4256         {
4257                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4258                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4259         }
4260         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4261                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4262         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4263         {
4264                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4265                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4266                 for (x = startx;x < endx;x++)
4267                 {
4268                         z = buffer_z[x];
4269                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4270                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4271                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4272                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4273                         if (attenuation < 0.01f)
4274                                 continue;
4275                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4276                         {
4277                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4278                                 if (attenuation < 0.01f)
4279                                         continue;
4280                         }
4281
4282                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4283                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4284                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4285                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4286                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4287                         {
4288                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4289                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4290                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4291                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4292                         }
4293                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4294                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4295                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4296                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4297                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4298                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4299                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4300                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4301
4302                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4303                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4304                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4305                         DPSOFTRAST_Vector3Normalize(lightnormal);
4306
4307                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4308
4309                         if(thread->shader_exactspecularmath)
4310                         {
4311                                 // reflect lightnormal at surfacenormal, take the negative of that
4312                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4313                                 float f;
4314                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4315                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4316                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4317                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4318
4319                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4320                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4321                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4322                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4323                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4324
4325                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4326                         }
4327                         else
4328                         {
4329                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4330                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4331                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4332                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4333
4334                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4335                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4336                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4337                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4338
4339                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4340                         }
4341                         specular = pow(specular, SpecularPower * glosstex[3]);
4342
4343                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4344                         {
4345                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4346                                 attenuation *= (1.0f / 255.0f);
4347                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4348                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4349                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4350                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4351                         }
4352                         else
4353                         {
4354                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4355                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4356                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4357                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4358                         }
4359                         buffer_FragColorbgra8[x*4+0] = d[0];
4360                         buffer_FragColorbgra8[x*4+1] = d[1];
4361                         buffer_FragColorbgra8[x*4+2] = d[2];
4362                         buffer_FragColorbgra8[x*4+3] = d[3];
4363                 }
4364         }
4365         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4366         {
4367                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4368                 for (x = startx;x < endx;x++)
4369                 {
4370                         z = buffer_z[x];
4371                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4372                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4373                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4374                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4375                         if (attenuation < 0.01f)
4376                                 continue;
4377                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4378                         {
4379                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4380                                 if (attenuation < 0.01f)
4381                                         continue;
4382                         }
4383
4384                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4385                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4386                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4387                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4388                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4389                         {
4390                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4391                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4392                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4393                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4394                         }
4395                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4396                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4397                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4398                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4399
4400                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4401                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4402                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4403                         DPSOFTRAST_Vector3Normalize(lightnormal);
4404
4405                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4406                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4407                         {
4408                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4409                                 attenuation *= (1.0f / 255.0f);
4410                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4411                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4412                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4413                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4414                         }
4415                         else
4416                         {
4417                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4418                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4419                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4420                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4421                         }
4422                         buffer_FragColorbgra8[x*4+0] = d[0];
4423                         buffer_FragColorbgra8[x*4+1] = d[1];
4424                         buffer_FragColorbgra8[x*4+2] = d[2];
4425                         buffer_FragColorbgra8[x*4+3] = d[3];
4426                 }
4427         }
4428         else
4429         {
4430                 for (x = startx;x < endx;x++)
4431                 {
4432                         z = buffer_z[x];
4433                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4434                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4435                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4436                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4437                         if (attenuation < 0.01f)
4438                                 continue;
4439                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4440                         {
4441                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4442                                 if (attenuation < 0.01f)
4443                                         continue;
4444                         }
4445
4446                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4447                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4448                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4449                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4450                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4451                         {
4452                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4453                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4454                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4455                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4456                         }
4457                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4458                         {
4459                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4460                                 attenuation *= (1.0f / 255.0f);
4461                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4462                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4463                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4464                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4465                         }
4466                         else
4467                         {
4468                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4469                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4470                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4471                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4472                         }
4473                         buffer_FragColorbgra8[x*4+0] = d[0];
4474                         buffer_FragColorbgra8[x*4+1] = d[1];
4475                         buffer_FragColorbgra8[x*4+2] = d[2];
4476                         buffer_FragColorbgra8[x*4+3] = d[3];
4477                 }
4478         }
4479         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4480 #endif
4481 }
4482
4483
4484
4485 void DPSOFTRAST_VertexShader_Refraction(void)
4486 {
4487         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4488         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4489         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4490 }
4491
4492 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4493 {
4494         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4495
4496         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4497         float z;
4498         int x, startx = span->startx, endx = span->endx;
4499
4500         // texture reads
4501         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4502         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4503
4504         // varyings
4505         float ModelViewProjectionPositiondata[4];
4506         float ModelViewProjectionPositionslope[4];
4507
4508         // uniforms
4509         float ScreenScaleRefractReflect[2];
4510         float ScreenCenterRefractReflect[2];
4511         float DistortScaleRefractReflect[2];
4512         float RefractColor[4];
4513
4514         const unsigned char * RESTRICT pixelbase;
4515         const unsigned char * RESTRICT pixel[4];
4516         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4517         if(!texture) return;
4518         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4519
4520         // read textures
4521         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4522         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4523
4524         // read varyings
4525         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4526
4527         // read uniforms
4528         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4529         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4530         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4531         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4532         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4533         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4534         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4535         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4536         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4537         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4538
4539         // do stuff
4540         for (x = startx;x < endx;x++)
4541         {
4542                 float SafeScreenTexCoord[2];
4543                 float ScreenTexCoord[2];
4544                 float v[3];
4545                 float iw;
4546                 unsigned char c[4];
4547
4548                 z = buffer_z[x];
4549
4550                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4551                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4552                 
4553                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4554                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4555                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4556
4557                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4558                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4559                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4560                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4561                 DPSOFTRAST_Vector3Normalize(v);
4562                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4563                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4564
4565                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4566                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4567                 {
4568                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4569                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4570                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4571                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4572                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4573                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4574                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4575                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4576                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4577                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4578                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4579                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4580                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4581                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4582                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4583                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4584                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4585                 }
4586                 else
4587                 {
4588                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4589                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4590                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4591                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4592                         c[0] = pixel[0][0];
4593                         c[1] = pixel[0][1];
4594                         c[2] = pixel[0][2];
4595                 }
4596
4597                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4598                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4599                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4600                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4601                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4602         }
4603
4604         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4605 }
4606
4607
4608
4609 void DPSOFTRAST_VertexShader_Water(void)
4610 {
4611         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4612 }
4613
4614
4615 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4616 {
4617         // TODO: IMPLEMENT
4618         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4619         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4620         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4621         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4622         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4623 }
4624
4625
4626
4627 void DPSOFTRAST_VertexShader_ShowDepth(void)
4628 {
4629         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4630 }
4631
4632 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4633 {
4634         // TODO: IMPLEMENT
4635         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4636         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4637         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4638         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4639         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4640 }
4641
4642
4643
4644 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4645 {
4646         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4647 }
4648
4649 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4650 {
4651         // TODO: IMPLEMENT
4652         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4653         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4654         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4655         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4656         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4657 }
4658
4659
4660
4661 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4662 {
4663         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4664 }
4665
4666 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4667 {
4668         // TODO: IMPLEMENT
4669         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4670         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4671         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4672         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4673         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4674 }
4675
4676
4677
4678 typedef struct DPSOFTRAST_ShaderModeInfo_s
4679 {
4680         int lodarrayindex;
4681         void (*Vertex)(void);
4682         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4683         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4684         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4685 }
4686 DPSOFTRAST_ShaderModeInfo;
4687
4688 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4689 {
4690         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4691         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4692         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4693         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4694         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4695         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4696         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4697         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4698         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4699         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4700         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4701         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4702         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4703         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4704         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4705         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4706 };
4707
4708 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4709 {
4710         int i;
4711         int x;
4712         int startx;
4713         int endx;
4714 //      unsigned int c;
4715 //      unsigned int *colorpixel;
4716         unsigned int *depthpixel;
4717         float w;
4718         float wslope;
4719         int depth;
4720         int depthslope;
4721         unsigned int d;
4722         DPSOFTRAST_State_Triangle *triangle;
4723         DPSOFTRAST_State_Span *span;
4724         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4725         for (i = 0; i < thread->numspans; i++)
4726         {
4727                 span = &thread->spans[i];
4728                 triangle = &thread->triangles[span->triangle];
4729                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4730                 {
4731                         wslope = triangle->w[0];
4732                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4733                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4734                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4735                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4736                         startx = span->startx;
4737                         endx = span->endx;
4738                         switch(thread->fb_depthfunc)
4739                         {
4740                         default:
4741                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4742                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4743                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4744                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4745                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4746                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4747                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4748                         }
4749                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4750                         //for (x = startx;x < endx;x++)
4751                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4752                         // if there is no color buffer, skip pixel shader
4753                         while (startx < endx && !pixelmask[startx])
4754                                 startx++;
4755                         while (endx > startx && !pixelmask[endx-1])
4756                                 endx--;
4757                         if (startx >= endx)
4758                                 continue; // no pixels to fill
4759                         span->pixelmask = pixelmask;
4760                         span->startx = startx;
4761                         span->endx = endx;
4762                         // run pixel shader if appropriate
4763                         // do this before running depthmask code, to allow the pixelshader
4764                         // to clear pixelmask values for alpha testing
4765                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4766                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4767                         if (thread->depthmask)
4768                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4769                                         if (pixelmask[x])
4770                                                 depthpixel[x] = d;
4771                 }
4772                 else
4773                 {
4774                         // no depth testing means we're just dealing with color...
4775                         // if there is no color buffer, skip pixel shader
4776                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4777                         {
4778                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4779                                 span->pixelmask = pixelmask;
4780                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4781                         }
4782                 }
4783         }
4784         thread->numspans = 0;
4785 }
4786
// Draw command record; its fields are declared through the DEFCOMMAND macro, which is defined outside this part of the file.
// NOTE(review): 22 is presumably the command's opcode - confirm against the DEFCOMMAND definition.
// How DPSOFTRAST_Interpret_Draw uses the fields:
// - starty/endy: compared against the thread's (scissor-clamped) row bands to decide whether the whole command can be skipped
// - refcount: atomically decremented when a thread skips the command; on reaching zero, arrays is released with MM_FREE,
//   but only if commandsize does not exceed the aligned size of DPSOFTRAST_Command_Draw
// - clipped: when zero, the near plane clip test is bypassed for every triangle
// - firstvertex: subtracted from each element3s/element3i index
// - arrays: numvertices*4 floats of screen coordinates, followed by further blocks of numvertices*4 floats
// - element3s/element3i: optional index lists (element3s is checked first); when both are NULL, vertices are consumed sequentially
4787 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4788
4789 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4790 {
4791 #ifdef SSE_POSSIBLE
4792         int cullface = thread->cullface;
4793         int minx, maxx, miny, maxy;
4794         int miny1, maxy1, miny2, maxy2;
4795         __m128i fbmin, fbmax;
4796         __m128 viewportcenter, viewportscale;
4797         int firstvertex = command->firstvertex;
4798         int numvertices = command->numvertices;
4799         int numtriangles = command->numtriangles;
4800         const int *element3i = command->element3i;
4801         const unsigned short *element3s = command->element3s;
4802         int clipped = command->clipped;
4803         int i;
4804         int j;
4805         int k;
4806         int y;
4807         int e[3];
4808         __m128i screeny;
4809         int starty, endy, bandy;
4810         int numpoints;
4811         int clipcase;
4812         float clipdist[4];
4813         float clip0origin, clip0slope;
4814         int clip0dir;
4815         __m128 triangleedge1, triangleedge2, trianglenormal;
4816         __m128 clipfrac[3];
4817         __m128 screen[4];
4818         DPSOFTRAST_State_Triangle *triangle;
4819         DPSOFTRAST_Texture *texture;
4820         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4821         miny = thread->fb_scissor[1];
4822         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4823         miny1 = bound(miny, thread->miny1, maxy);
4824         maxy1 = bound(miny, thread->maxy1, maxy);
4825         miny2 = bound(miny, thread->miny2, maxy);
4826         maxy2 = bound(miny, thread->maxy2, maxy);
4827         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4828         {
4829                 if (!ATOMIC_DECREMENT(command->refcount))
4830                 {
4831                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4832                                 MM_FREE(command->arrays);
4833                 }
4834                 return;
4835         }
4836         minx = thread->fb_scissor[0];
4837         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4838         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4839         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4840         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4841         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4842         screen[3] = _mm_setzero_ps();
4843         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4844         for (i = 0;i < numtriangles;i++)
4845         {
4846                 const float *screencoord4f = command->arrays;
4847                 const float *arrays = screencoord4f + numvertices*4;
4848
4849                 // generate the 3 edges of this triangle
4850                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4851                 if (element3s)
4852                 {
4853                         e[0] = element3s[i*3+0] - firstvertex;
4854                         e[1] = element3s[i*3+1] - firstvertex;
4855                         e[2] = element3s[i*3+2] - firstvertex;
4856                 }
4857                 else if (element3i)
4858                 {
4859                         e[0] = element3i[i*3+0] - firstvertex;
4860                         e[1] = element3i[i*3+1] - firstvertex;
4861                         e[2] = element3i[i*3+2] - firstvertex;
4862                 }
4863                 else
4864                 {
4865                         e[0] = i*3+0;
4866                         e[1] = i*3+1;
4867                         e[2] = i*3+2;
4868                 }
4869
4870 #define SKIPBACKFACE \
4871                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4872                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4873                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4874                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4875                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4876                 switch(cullface) \
4877                 { \
4878                 case GL_BACK: \
4879                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4880                                 continue; \
4881                         break; \
4882                 case GL_FRONT: \
4883                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4884                                 continue; \
4885                         break; \
4886                 }
4887
4888 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4889                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4890                         { \
4891                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4892                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4893                         }
4894 #define CLIPPEDVERTEXCOPY(k,p1) \
4895                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4896
4897 #define GENATTRIBCOPY(attrib, p1) \
4898                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4899 #define GENATTRIBLERP(attrib, p1, p2) \
4900                 { \
4901                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4902                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4903                 }
4904 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4905                 switch(clipcase) \
4906                 { \
4907                 default: \
4908                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4909                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4910                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4911                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4912                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4913                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4914                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4915                 }
4916
4917                 if (! clipped)
4918                         goto notclipped;
4919
4920                 // calculate distance from nearplane
4921                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4922                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4923                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4924                 if (clipdist[0] >= 0.0f)
4925                 {
4926                         if (clipdist[1] >= 0.0f)
4927                         {
4928                                 if (clipdist[2] >= 0.0f)
4929                                 {
4930                                 notclipped:
4931                                         // triangle is entirely in front of nearplane
4932                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4933                                         SKIPBACKFACE;
4934                                         numpoints = 3;
4935                                         clipcase = 0;
4936                                 }
4937                                 else
4938                                 {
4939                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4940                                         SKIPBACKFACE;
4941                                         numpoints = 4;
4942                                         clipcase = 1;
4943                                 }
4944                         }
4945                         else
4946                         {
4947                                 if (clipdist[2] >= 0.0f)
4948                                 {
4949                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4950                                         SKIPBACKFACE;
4951                                         numpoints = 4;
4952                                         clipcase = 2;
4953                                 }
4954                                 else
4955                                 {
4956                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4957                                         SKIPBACKFACE;
4958                                         numpoints = 3;
4959                                         clipcase = 3;
4960                                 }
4961                         }
4962                 }
4963                 else if (clipdist[1] >= 0.0f)
4964                 {
4965                         if (clipdist[2] >= 0.0f)
4966                         {
4967                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4968                                 SKIPBACKFACE;
4969                                 numpoints = 4;
4970                                 clipcase = 4;
4971                         }
4972                         else
4973                         {
4974                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4975                                 SKIPBACKFACE;
4976                                 numpoints = 3;
4977                                 clipcase = 5;
4978                         }
4979                 }
4980                 else if (clipdist[2] >= 0.0f)
4981                 {
4982                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4983                         SKIPBACKFACE;
4984                         numpoints = 3;
4985                         clipcase = 6;
4986                 }
4987                 else continue; // triangle is entirely behind nearplane
4988
4989                 {
4990                         // calculate integer y coords for triangle points
4991                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4992                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4993                                         screenmin = _mm_min_epi16(screeni, screenir),
4994                                         screenmax = _mm_max_epi16(screeni, screenir);
4995                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4996                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4997                         screenmin = _mm_max_epi16(screenmin, fbmin);
4998                         screenmax = _mm_min_epi16(screenmax, fbmax);
4999                         // skip offscreen triangles
5000                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5001                                 continue;
5002                         starty = _mm_extract_epi16(screenmin, 1);
5003                         endy = _mm_extract_epi16(screenmax, 1)+1;
5004                         if (starty >= maxy1 && endy <= miny2)
5005                                 continue;
5006                         screeny = _mm_srai_epi32(screeni, 16);
5007                 }
5008
5009                 triangle = &thread->triangles[thread->numtriangles];
5010
5011                 // calculate attribute plans for triangle data...
5012                 // okay, this triangle is going to produce spans, we'd better project
5013                 // the interpolants now (this is what gives perspective texturing),
5014                 // this consists of simply multiplying all arrays by the W coord
5015                 // (which is basically 1/Z), which will be undone per-pixel
5016                 // (multiplying by Z again) to get the perspective-correct array
5017                 // values
5018                 {
5019                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5020                         __m128 mipedgescale, mipdensity;
5021                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5022                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5023                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5024                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5025                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5026                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5027                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5028                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5029                         attribedge1 = _mm_sub_ss(w0, w1);
5030                         attribedge2 = _mm_sub_ss(w2, w1);
5031                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5032                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5033                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5034                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5035                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5036                         _mm_store_ss(&triangle->w[0], attribxslope);
5037                         _mm_store_ss(&triangle->w[1], attribyslope);
5038                         _mm_store_ss(&triangle->w[2], attriborigin);
5039                         
5040                         clip0origin = 0;
5041                         clip0slope = 0;
5042                         clip0dir = 0;
5043                         if(thread->clipplane[0] || thread->clipplane[1] || thread->clipplane[2])
5044                         {
5045                                 float cliporigin, clipxslope, clipyslope;
5046                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5047                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5048                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5049                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5050                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5051                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5052                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->clipplane[2] + thread->clipplane[3];
5053                                 clipxslope = thread->clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->clipplane[2];
5054                                 clipyslope = thread->clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->clipplane[2];
5055                                 if(clipxslope != 0)
5056                                 {
5057                                         clip0origin = -cliporigin/clipxslope;
5058                                         clip0slope = -clipyslope/clipxslope;
5059                                         clip0dir = clipxslope > 0 ? 1 : -1;
5060                                 }
5061                                 else if(clipyslope > 0)
5062                                 {
5063                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5064                                         clip0slope = dpsoftrast.fb_width;
5065                                         clip0dir = -1;
5066                                 }
5067                                 else if(clipyslope < 0)
5068                                 {
5069                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5070                                         clip0slope = -dpsoftrast.fb_width;
5071                                         clip0dir = -1;
5072                                 }
5073                                 else if(clip0origin < 0) continue;
5074                         }
5075
5076                         mipedgescale = _mm_setzero_ps();
5077                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5078                         {
5079                                 __m128 attrib0, attrib1, attrib2;
5080                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5081                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5082                                         break;
5083                                 arrays += numvertices*4;
5084                                 GENATTRIBS(attrib0, attrib1, attrib2);
5085                                 attriborigin = _mm_mul_ps(attrib1, w1);
5086                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5087                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5088                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5089                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5090                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5091                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5092                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5093                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5094                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5095                                 {
5096                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5097                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5098                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5099                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5100                                 }
5101                         }
5102
5103                         memset(triangle->mip, 0, sizeof(triangle->mip));
5104                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5105                         {
5106                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5107                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5108                                         break;
5109                                 texture = thread->texbound[texunit];
5110                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5111                                 {
5112                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5113                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5114                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5115                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5116                                         // this will be multiplied in the texturing routine by the texture resolution
5117                                         y = _mm_cvtss_si32(mipdensity);
5118                                         if (y > 0)
5119                                         {
5120                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5121                                                 if (y > texture->mipmaps - 1)
5122                                                         y = texture->mipmaps - 1;
5123                                                 triangle->mip[texunit] = y;
5124                                         }
5125                                 }
5126                         }
5127                 }
5128         
5129                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5130                 for (; y < bandy;)
5131                 {
5132                         __m128 xcoords, xslope;
5133                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5134                         int yccmask = _mm_movemask_epi8(ycc);
5135                         int edge0p, edge0n, edge1p, edge1n;
5136                         int nexty;
5137                         float clip0;
5138                         if (numpoints == 4)
5139                         {
5140                                 switch(yccmask)
5141                                 {
5142                                 default:
5143                                 case 0xFFFF: /*0000*/ y = endy; continue;
5144                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5145                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5146                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5147                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5148                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5149                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5150                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5151                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5152                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5153                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5154                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5155                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5156                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5157                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5158                                 case 0x0000: /*1111*/ y++; continue;
5159                                 }
5160                         }
5161                         else
5162                         {
5163                                 switch(yccmask)
5164                                 {
5165                                 default:
5166                                 case 0xFFFF: /*000*/ y = endy; continue;
5167                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5168                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5169                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5170                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5171                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5172                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5173                                 case 0x0000: /*111*/ y++; continue;
5174                                 }
5175                         }
5176                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5177                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5178                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5179                         nexty = _mm_extract_epi16(ycc, 0);
5180                         if (nexty >= bandy) nexty = bandy-1;
5181                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5182                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5183                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5184                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5185                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5186                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5187                         {
5188                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5189                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5190                         }
5191                         clip0 = clip0origin + (y+0.5f)*clip0slope;
5192                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5193                         {
5194                                 int startx, endx, clipx = minx, offset;
5195                                 startx = _mm_cvtss_si32(xcoords);
5196                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5197                                 if (startx < minx) 
5198                                 {
5199                                         if (startx < 0) startx = 0;
5200                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5201                                 }
5202                                 if (endx > maxx) endx = maxx;
5203                                 if (startx >= endx) continue;
5204
5205                                 if (clip0dir)
5206                                 {
5207                                         if (clip0dir > 0)
5208                                         {
5209                                                 if (startx < clip0) 
5210                                                 {
5211                                                         if(endx <= clip0) continue;
5212                                                         clipx = max((int)clip0, minx);
5213                                                         startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1); 
5214                                                 }
5215                                         }
5216                                         else if (endx > clip0) 
5217                                         {
5218                                                 if(startx >= clip0) continue;
5219                                                 endx = (int)clip0;
5220                                         }
5221                                 }
5222                                                 
5223                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5224                                 {
5225                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5226                                         span->triangle = thread->numtriangles;
5227                                         span->x = offset;
5228                                         span->y = y;
5229                                         span->startx = max(clipx - offset, 0);
5230                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5231                                         if (span->startx >= span->endx)
5232                                                 continue; 
5233                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5234                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5235                                 }
5236                         }
5237                 }
5238
5239                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5240                 {
5241                         DPSOFTRAST_Draw_ProcessSpans(thread);
5242                         thread->numtriangles = 0;
5243                 }
5244         }
5245
5246         if (!ATOMIC_DECREMENT(command->refcount))
5247         {
5248                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5249                         MM_FREE(command->arrays);
5250         }
5251
5252         if (thread->numspans > 0 || thread->numtriangles > 0)
5253         {
5254                 DPSOFTRAST_Draw_ProcessSpans(thread);
5255                 thread->numtriangles = 0;
5256         }
5257 #endif
5258 }
5259
5260 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5261 {
5262         int i;
5263         int j;
5264         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5265         int datasize = 2*numvertices*sizeof(float[4]);
5266         DPSOFTRAST_Command_Draw *command;
5267         unsigned char *data;
5268         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5269         {
5270                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5271                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5272                         break;
5273                 datasize += numvertices*sizeof(float[4]);
5274         }
5275         if (element3s)
5276                 datasize += numtriangles*sizeof(unsigned short[3]);
5277         else if (element3i)
5278                 datasize += numtriangles*sizeof(int[3]);
5279         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5280         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5281         {
5282                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5283                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5284         }
5285         else
5286         {
5287                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5288                 data = (unsigned char *)command + commandsize;
5289         }
5290         command->firstvertex = firstvertex;
5291         command->numvertices = numvertices;
5292         command->numtriangles = numtriangles;
5293         command->arrays = (float *)data;
5294         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5295         dpsoftrast.firstvertex = firstvertex;
5296         dpsoftrast.numvertices = numvertices;
5297         dpsoftrast.screencoord4f = (float *)data;
5298         data += numvertices*sizeof(float[4]);
5299         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5300         data += numvertices*sizeof(float[4]);
5301         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5302         {
5303                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5304                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5305                         break;
5306                 dpsoftrast.post_array4f[j] = (float *)data;
5307                 data += numvertices*sizeof(float[4]);
5308         }
5309         command->element3i = NULL;
5310         command->element3s = NULL;
5311         if (element3s)
5312         {
5313                 command->element3s = (unsigned short *)data;
5314                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5315         }
5316         else if (element3i)
5317         {
5318                 command->element3i = (int *)data;
5319                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5320         }
5321         return command;
5322 }
5323
5324 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5325 {
5326         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5327         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5328         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5329         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5330         if (command->starty >= command->endy)
5331         {
5332                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5333                         MM_FREE(command->arrays);
5334                 DPSOFTRAST_UndoCommand(command->commandsize);
5335                 return;
5336         }
5337         command->clipped = dpsoftrast.drawclipped;
5338         command->refcount = dpsoftrast.numthreads;
5339
5340         if (dpsoftrast.usethreads)
5341         {
5342                 int i;
5343                 DPSOFTRAST_Draw_SyncCommands();
5344                 for (i = 0; i < dpsoftrast.numthreads; i++)
5345                 {
5346                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5347                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5348                                 Thread_CondSignal(thread->drawcond);
5349                 }
5350         }
5351         else
5352         {
5353                 DPSOFTRAST_Draw_FlushThreads();
5354         }
5355 }
5356
5357 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5358 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5359 {
5360         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5361 }
5362 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5363 {
5364         DPSOFTRAST_Command_SetRenderTargets *command;
5365         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5366                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5367                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5368                 DPSOFTRAST_Flush();
5369         dpsoftrast.fb_width = width;
5370         dpsoftrast.fb_height = height;
5371         dpsoftrast.fb_depthpixels = depthpixels;
5372         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5373         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5374         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5375         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5376         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5377         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5378         command->width = width;
5379         command->height = height;
5380 }
5381  
5382 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5383 {
5384         int commandoffset = thread->commandoffset;
5385         while (commandoffset != endoffset)
5386         {
5387                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5388                 switch (command->opcode)
5389                 {
5390 #define INTERPCOMMAND(name) \
5391                 case DPSOFTRAST_OPCODE_##name : \
5392                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5393                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5394                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5395                                 commandoffset = 0; \
5396                         break;
5397                 INTERPCOMMAND(Viewport)
5398                 INTERPCOMMAND(ClearColor)
5399                 INTERPCOMMAND(ClearDepth)
5400                 INTERPCOMMAND(ColorMask)
5401                 INTERPCOMMAND(DepthTest)
5402                 INTERPCOMMAND(ScissorTest)
5403                 INTERPCOMMAND(Scissor)
5404                 INTERPCOMMAND(BlendFunc)
5405                 INTERPCOMMAND(BlendSubtract)
5406                 INTERPCOMMAND(DepthMask)
5407                 INTERPCOMMAND(DepthFunc)
5408                 INTERPCOMMAND(DepthRange)
5409                 INTERPCOMMAND(PolygonOffset)
5410                 INTERPCOMMAND(CullFace)
5411                 INTERPCOMMAND(AlphaTest)
5412                 INTERPCOMMAND(AlphaFunc)
5413                 INTERPCOMMAND(SetTexture)
5414                 INTERPCOMMAND(SetShader)
5415                 INTERPCOMMAND(Uniform4f)
5416                 INTERPCOMMAND(UniformMatrix4f)
5417                 INTERPCOMMAND(Uniform1i)
5418                 INTERPCOMMAND(SetRenderTargets)
5419                 INTERPCOMMAND(ClipPlane)
5420
5421                 case DPSOFTRAST_OPCODE_Draw:
5422                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5423                         commandoffset += command->commandsize;
5424                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5425                                 commandoffset = 0;
5426                         thread->commandoffset = commandoffset;
5427                         break;
5428
5429                 case DPSOFTRAST_OPCODE_Reset:
5430                         commandoffset = 0;
5431                         break;
5432                 }
5433         }
5434         thread->commandoffset = commandoffset;
5435 }
5436
5437 static int DPSOFTRAST_Draw_Thread(void *data)
5438 {
5439         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5440         while(thread->index >= 0)
5441         {
5442                 if (thread->commandoffset != dpsoftrast.drawcommand)
5443                 {
5444                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5445                 }
5446                 else 
5447                 {
5448                         Thread_LockMutex(thread->drawmutex);
5449                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5450                         {
5451                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5452                                 thread->starving = true;
5453                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5454                                 thread->starving = false;
5455                         }
5456                         Thread_UnlockMutex(thread->drawmutex);
5457                 }
5458         }   
5459         return 0;
5460 }
5461
5462 static void DPSOFTRAST_Draw_FlushThreads(void)
5463 {
5464         DPSOFTRAST_State_Thread *thread;
5465         int i;
5466         DPSOFTRAST_Draw_SyncCommands();
5467         if (dpsoftrast.usethreads) 
5468         {
5469                 for (i = 0; i < dpsoftrast.numthreads; i++)
5470                 {
5471                         thread = &dpsoftrast.threads[i];
5472                         if (thread->commandoffset != dpsoftrast.drawcommand)
5473                         {
5474                                 Thread_LockMutex(thread->drawmutex);
5475                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5476                                         Thread_CondSignal(thread->drawcond);
5477                                 Thread_UnlockMutex(thread->drawmutex);
5478                         }
5479                 }
5480                 for (i = 0; i < dpsoftrast.numthreads; i++)
5481                 {
5482                         thread = &dpsoftrast.threads[i];
5483                         if (thread->commandoffset != dpsoftrast.drawcommand)
5484                         {
5485                                 Thread_LockMutex(thread->drawmutex);
5486                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5487                                 {
5488                                         thread->waiting = true;
5489                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5490                                         thread->waiting = false;
5491                                 }
5492                                 Thread_UnlockMutex(thread->drawmutex);
5493                         }
5494                 }
5495         }
5496         else
5497         {
5498                 for (i = 0; i < dpsoftrast.numthreads; i++)
5499                 {
5500                         thread = &dpsoftrast.threads[i];
5501                         if (thread->commandoffset != dpsoftrast.drawcommand)
5502                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5503                 }
5504         }
5505         dpsoftrast.commandpool.usedcommands = 0;
5506 }
5507
// Public entry point: executes all queued commands to completion by
// delegating to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
	return;
}
5512
// Public entry point: currently identical to DPSOFTRAST_Flush().
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
	return;
}
5517
5518 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5519 {
5520         int i;
5521         union
5522         {
5523                 int i;
5524                 unsigned char b[4];
5525         }
5526         u;
5527         u.i = 1;
5528         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5529         dpsoftrast.bigendian = u.b[3];
5530         dpsoftrast.fb_width = width;
5531         dpsoftrast.fb_height = height;
5532         dpsoftrast.fb_depthpixels = depthpixels;
5533         dpsoftrast.fb_colorpixels[0] = colorpixels;
5534         dpsoftrast.fb_colorpixels[1] = NULL;
5535         dpsoftrast.fb_colorpixels[1] = NULL;
5536         dpsoftrast.fb_colorpixels[1] = NULL;
5537         dpsoftrast.viewport[0] = 0;
5538         dpsoftrast.viewport[1] = 0;
5539         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5540         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5541         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5542         dpsoftrast.texture_firstfree = 1;
5543         dpsoftrast.texture_end = 1;
5544         dpsoftrast.texture_max = 0;
5545         dpsoftrast.color[0] = 1;
5546         dpsoftrast.color[1] = 1;
5547         dpsoftrast.color[2] = 1;
5548         dpsoftrast.color[3] = 1;
5549         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5550         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5551         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5552         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5553         for (i = 0; i < dpsoftrast.numthreads; i++)
5554         {
5555                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5556                 thread->index = i;
5557                 thread->cullface = GL_BACK;
5558                 thread->colormask[1] = 1;
5559                 thread->colormask[2] = 1;
5560                 thread->colormask[3] = 1;
5561                 thread->blendfunc[0] = GL_ONE;
5562                 thread->blendfunc[1] = GL_ZERO;
5563                 thread->depthmask = true;
5564                 thread->depthtest = true;
5565                 thread->depthfunc = GL_LEQUAL;
5566                 thread->scissortest = false;
5567                 thread->alphatest = false;
5568                 thread->alphafunc = GL_GREATER;
5569                 thread->alphavalue = 0.5f;
5570                 thread->viewport[0] = 0;
5571                 thread->viewport[1] = 0;
5572                 thread->viewport[2] = dpsoftrast.fb_width;
5573                 thread->viewport[3] = dpsoftrast.fb_height;
5574                 thread->scissor[0] = 0;
5575                 thread->scissor[1] = 0;
5576                 thread->scissor[2] = dpsoftrast.fb_width;
5577                 thread->scissor[3] = dpsoftrast.fb_height;
5578                 thread->depthrange[0] = 0;
5579                 thread->depthrange[1] = 1;
5580                 thread->polygonoffset[0] = 0;
5581                 thread->polygonoffset[1] = 0;
5582                 thread->clipplane[0] = 0;
5583                 thread->clipplane[1] = 0;
5584                 thread->clipplane[2] = 0;
5585                 thread->clipplane[3] = 1;
5586         
5587                 DPSOFTRAST_RecalcThread(thread);
5588         
5589                 thread->numspans = 0;
5590                 thread->numtriangles = 0;
5591                 thread->commandoffset = 0;
5592                 thread->waiting = false;
5593                 thread->starving = false;
5594            
5595                 thread->validate = -1;
5596                 DPSOFTRAST_Validate(thread, -1);
5597  
5598                 if (dpsoftrast.usethreads)
5599                 {
5600                         thread->waitcond = Thread_CreateCond();
5601                         thread->drawcond = Thread_CreateCond();
5602                         thread->drawmutex = Thread_CreateMutex();
5603                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5604                 }
5605         }
5606         return 0;
5607 }
5608
5609 void DPSOFTRAST_Shutdown(void)
5610 {
5611         int i;
5612         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5613         {
5614                 DPSOFTRAST_State_Thread *thread;
5615                 for (i = 0; i < dpsoftrast.numthreads; i++)
5616                 {
5617                         thread = &dpsoftrast.threads[i];
5618                         Thread_LockMutex(thread->drawmutex);
5619                         thread->index = -1;
5620                         Thread_CondSignal(thread->drawcond);
5621                         Thread_UnlockMutex(thread->drawmutex);
5622                         Thread_WaitThread(thread->thread, 0);
5623                         Thread_DestroyCond(thread->waitcond);
5624                         Thread_DestroyCond(thread->drawcond);
5625                         Thread_DestroyMutex(thread->drawmutex);
5626                 }
5627         }
5628         for (i = 0;i < dpsoftrast.texture_end;i++)
5629                 if (dpsoftrast.texture[i].bytes)
5630                         MM_FREE(dpsoftrast.texture[i].bytes);
5631         if (dpsoftrast.texture)
5632                 free(dpsoftrast.texture);
5633         if (dpsoftrast.threads)
5634                 MM_FREE(dpsoftrast.threads);
5635         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5636 }
5637