git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
initial support for user clipping plane in dpsoftrast
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Platform-specific alignment qualifiers, store fence and atomic counter
// operations. They are only configured when SSE_POSSIBLE is defined; every
// macro left undefined here gets a plain fallback definition further down.
#if defined(SSE_POSSIBLE)
	#if defined(__APPLE__)
		// Apple: OSAtomic "Barrier" variants on a 32 bit counter
		#include <libkern/OSAtomic.h>
		#define ATOMIC_COUNTER volatile int32_t 
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
	#elif defined(__GNUC__)
		// GCC-compatible compilers: __sync builtins
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
		// a store fence is used; the alternative (__sync_synchronize()) is left disabled
		#define MEMORY_BARRIER (_mm_sfence())
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
	#elif defined(_MSC_VER)
		// MSVC: Interlocked* functions on a LONG counter
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
		// a store fence is used; the alternative (MemoryBarrier()) is left disabled
		#define MEMORY_BARRIER (_mm_sfence())
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
	#endif
#endif
50
// Fallback definitions for every macro the platform block did not provide:
// no alignment qualifier, no fence, and plain (non-atomic) int counters.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif

#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif

#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif

#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif

// pre-increment / pre-decrement: both evaluate to the updated value
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif

#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif

// adds val to counter; yields no value
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Pixel/command memory allocation wrappers.
// With SSE the buffers are obtained from _mm_malloc with ATOMIC_SIZE alignment
// and must be released with _mm_free; otherwise the C library allocator is used.
// Memory from MM_MALLOC/MM_CALLOC must always be released with MM_FREE.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() counterpart of MM_MALLOC: returns nmemb*size zero-filled bytes, or
// NULL when the allocation fails or the byte count does not fit in size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around, like calloc() does
	if (size != 0 && nmemb > ((size_t)-1) / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets). DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// as an array dimension elsewhere in this file.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags;
111         int width;
112         int height;
113         int depth;
114         int sides;
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps;
117         int size;
118         ATOMIC_COUNTER binds;
119         unsigned char *bytes;
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// commands use the same alignment as ALIGN() data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header of every command; DEFCOMMAND() repeats these two fields at the
// start of each specific command struct.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
	unsigned short commandsize; // NOTE(review): presumably the size of the whole command in bytes - confirm
}
DPSOFTRAST_Command);

// opcode 0 is reserved for Reset
enum
{
	DPSOFTRAST_OPCODE_Reset = 0
};
135
// DEFCOMMAND(opcodeval, name, fields) declares one command type:
// - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
// - the aligned struct type DPSOFTRAST_Command_<name>, which starts with the
//   opcode/commandsize header and continues with the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
144
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand;
151         int usedcommands;
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar) evaluate
// attribute array `arrayindex` of `triangle` at the start of `span`:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// triangle and span are pointers; data and slope are outputs (__m128 lvalues
// for the SSE form, float[4] for the scalar form).
// Every argument is parenthesized so that expressions such as &span or
// spans + i can be passed; note that the arguments are evaluated several times.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced for a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer coordinates of the span
	int x, y;
	// usable range (according to pixelmask)
	int startx, endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
}
DPSOFTRAST_State_Span);

// capacities of the per-thread spans[] and triangles[] arrays
#define DPSOFTRAST_DRAW_MAXSPANS     1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
196
// Bits of the per-thread `validate` field: a set bit means the matching
// derived values must be recalculated by DPSOFTRAST_Validate.
#define DPSOFTRAST_VALIDATE_FB        (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// everything that has to be valid for drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps
// the GL blend factor pair of a thread to one of these values.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
217
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
221         int index;
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2];
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4];
235         int scissor[4];
236         float depthrange[2];
237         float polygonoffset[2];
238         ALIGN(float clipplane[4]);
239
240         int shader_mode;
241         int shader_permutation;
242         int shader_exactspecularmath;
243
244         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245         
246         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
247         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248
249         // DPSOFTRAST_VALIDATE_ flags
250         int validate;
251
252         // derived values (DPSOFTRAST_VALIDATE_FB)
253         int fb_colormask;
254         int fb_scissor[4];
255         ALIGN(float fb_viewportcenter[4]);
256         ALIGN(float fb_viewportscale[4]);
257
258         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
259         int fb_depthfunc;
260
261         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
262         int fb_blendmode;
263
264         // band boundaries
265         int miny1;
266         int maxy1;
267         int miny2;
268         int maxy2;
269
270         ATOMIC(volatile int commandoffset);
271
272         volatile bool waiting;
273         volatile bool starving;
274         void *waitcond;
275         void *drawcond;
276         void *drawmutex;
277
278         int numspans;
279         int numtriangles;
280         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
281         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
282 }
283 DPSOFTRAST_State_Thread);
284
285 typedef ATOMIC(struct DPSOFTRAST_State_s
286 {
287         int fb_width;
288         int fb_height;
289         unsigned int *fb_depthpixels;
290         unsigned int *fb_colorpixels[4];
291
292         int viewport[4];
293         ALIGN(float fb_viewportcenter[4]);
294         ALIGN(float fb_viewportscale[4]);
295
296         float color[4];
297         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
298         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
299
300         const float *pointer_vertex3f;
301         const float *pointer_color4f;
302         const unsigned char *pointer_color4ub;
303         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
304         int stride_vertex;
305         int stride_color;
306         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
308         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
309
310         int firstvertex;
311         int numvertices;
312         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
313         float *screencoord4f;
314         int drawstarty;
315         int drawendy;
316         int drawclipped;
317         
318         int shader_mode;
319         int shader_permutation;
320         int shader_exactspecularmath;
321
322         int texture_max;
323         int texture_end;
324         int texture_firstfree;
325         DPSOFTRAST_Texture *texture;
326
327         int bigendian;
328
329         // error reporting
330         const char *errorstring;
331
332         bool usethreads;
333         int interlace;
334         int numthreads;
335         DPSOFTRAST_State_Thread *threads;
336
337         ATOMIC(volatile int drawcommand);
338
339         DPSOFTRAST_State_Command_Pool commandpool;
340 }
341 DPSOFTRAST_State);
342
343 DPSOFTRAST_State dpsoftrast;
344
// depth buffer scale (2^30) and offset
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 value:
// blue in bits 0-7, green in bits 8-15, red in bits 16-23, alpha in bits 24-31.
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation.
// The shifts are done on unsigned values because shifting a full alpha byte
// into bit 31 of a signed int is undefined; the result type is still int.
// Arguments are parenthesized so that expressions can be passed.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// converts a float depth d to the integer depth representation: (1-d) scaled by DPSOFTRAST_DEPTHSCALE
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
350
351 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
352 {
353         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
354         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
355         fb_viewportcenter[3] = 0.5f;
356         fb_viewportcenter[0] = 0.0f;
357         fb_viewportscale[1] = 0.5f * viewport[2];
358         fb_viewportscale[2] = -0.5f * viewport[3];
359         fb_viewportscale[3] = 0.5f;
360         fb_viewportscale[0] = 1.0f;
361 }
362
363 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
364 {
365         if (dpsoftrast.interlace)
366         {
367                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
371         }
372         else
373         {
374                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
376         }
377 }
378
379 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
380 {
381         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
382         // and viewport projection values
383         int x1, x2;
384         int y1, y2;
385         x1 = thread->scissor[0];
386         x2 = thread->scissor[0] + thread->scissor[2];
387         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
388         y2 = dpsoftrast.fb_height - thread->scissor[1];
389         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
390         if (x1 < 0) x1 = 0;
391         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
392         if (y1 < 0) y1 = 0;
393         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
394         thread->fb_scissor[0] = x1;
395         thread->fb_scissor[1] = y1;
396         thread->fb_scissor[2] = x2 - x1;
397         thread->fb_scissor[3] = y2 - y1;
398
399         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
400         DPSOFTRAST_RecalcThread(thread);
401 }
402
403 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
404 {
405         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
406 }
407
408 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
409 {
410         if (thread->blendsubtract)
411         {
412                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
413                 {
414                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
415                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
416                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
417                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
418                 }
419         }
420         else
421         {       
422                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
423                 {
424                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
426                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
427                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
428                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
429                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
431                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
432                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
433                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
434                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
435                 }
436         }
437 }
438
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags f is
// pending in thread->validate; always evaluates to 0.
// thread is parenthesized so that pointer expressions can be passed; note that
// both arguments are evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
440
441 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
442 {
443         mask &= thread->validate;
444         if (!mask)
445                 return;
446         if (mask & DPSOFTRAST_VALIDATE_FB)
447         {
448                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
449                 DPSOFTRAST_RecalcFB(thread);
450         }
451         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
452         {
453                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
454                 DPSOFTRAST_RecalcDepthFunc(thread);
455         }
456         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
457         {
458                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
459                 DPSOFTRAST_RecalcBlendFunc(thread);
460         }
461 }
462
463 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
464 {
465         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
466                 return &dpsoftrast.texture[index];
467         return NULL;
468 }
469
470 static void DPSOFTRAST_Texture_Grow(void)
471 {
472         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
473         DPSOFTRAST_State_Thread *thread;
474         int i;
475         int j;
476         DPSOFTRAST_Flush();
477         // expand texture array as needed
478         if (dpsoftrast.texture_max < 1024)
479                 dpsoftrast.texture_max = 1024;
480         else
481                 dpsoftrast.texture_max *= 2;
482         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
483         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
484                 if (dpsoftrast.texbound[i])
485                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
486         for (j = 0; j < dpsoftrast.numthreads; j++)
487         {
488                 thread = &dpsoftrast.threads[j];
489                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
490                         if (thread->texbound[i])
491                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
492         }
493 }
494
495 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
496 {
497         int w;
498         int h;
499         int d;
500         int size;
501         int s;
502         int texnum;
503         int mipmaps;
504         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
505         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
506         DPSOFTRAST_Texture *texture;
507         if (width*height*depth < 1)
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
510                 return 0;
511         }
512         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
513         {
514                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
515                 return 0;
516         }
517         switch(texformat)
518         {
519         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
521         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
522                 break;
523         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
524                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
525                 {
526                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
527                         return 0;
528                 }
529                 if (depth != 1)
530                 {
531                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
532                         return 0;
533                 }
534                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
535                 {
536                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
537                         return 0;
538                 }
539                 break;
540         }
541         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
542         {
543                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
544                 return 0;
545         }
546         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547         {
548                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
549                 return 0;
550         }
551         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552         {
553                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
554                 return 0;
555         }
556         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
557         {
558                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
559                 return 0;
560         }
561         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
562         {
563                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
564                 return 0;
565         }
566         // find first empty slot in texture array
567         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
568                 if (!dpsoftrast.texture[texnum].bytes)
569                         break;
570         dpsoftrast.texture_firstfree = texnum + 1;
571         if (dpsoftrast.texture_max <= texnum)
572                 DPSOFTRAST_Texture_Grow();
573         if (dpsoftrast.texture_end <= texnum)
574                 dpsoftrast.texture_end = texnum + 1;
575         texture = &dpsoftrast.texture[texnum];
576         memset(texture, 0, sizeof(*texture));
577         texture->flags = flags;
578         texture->width = width;
579         texture->height = height;
580         texture->depth = depth;
581         texture->sides = sides;
582         texture->binds = 0;
583         w = width;
584         h = height;
585         d = depth;
586         size = 0;
587         mipmaps = 0;
588         w = width;
589         h = height;
590         d = depth;
591         for (;;)
592         {
593                 s = w * h * d * sides * 4;
594                 texture->mipmap[mipmaps][0] = size;
595                 texture->mipmap[mipmaps][1] = s;
596                 texture->mipmap[mipmaps][2] = w;
597                 texture->mipmap[mipmaps][3] = h;
598                 texture->mipmap[mipmaps][4] = d;
599                 size += s;
600                 mipmaps++;
601                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
602                         break;
603                 if (w > 1) w >>= 1;
604                 if (h > 1) h >>= 1;
605                 if (d > 1) d >>= 1;
606         }
607         texture->mipmaps = mipmaps;
608         texture->size = size;
609
610         // allocate the pixels now
611         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
612
613         return texnum;
614 }
615 void DPSOFTRAST_Texture_Free(int index)
616 {
617         DPSOFTRAST_Texture *texture;
618         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
619         if (texture->binds)
620                 DPSOFTRAST_Flush();
621         if (texture->bytes)
622                 MM_FREE(texture->bytes);
623         texture->bytes = NULL;
624         memset(texture, 0, sizeof(*texture));
625         // adjust the free range and used range
626         if (dpsoftrast.texture_firstfree > index)
627                 dpsoftrast.texture_firstfree = index;
628         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
629                 dpsoftrast.texture_end--;
630 }
631 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
632 {
633         int i, x, y, z, w, layer0, layer1, row0, row1;
634         unsigned char *o, *i0, *i1, *i2, *i3;
635         DPSOFTRAST_Texture *texture;
636         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
637         if (texture->mipmaps <= 1)
638                 return;
639         for (i = 1;i < texture->mipmaps;i++)
640         {
641                 for (z = 0;z < texture->mipmap[i][4];z++)
642                 {
643                         layer0 = z*2;
644                         layer1 = z*2+1;
645                         if (layer1 >= texture->mipmap[i-1][4])
646                                 layer1 = texture->mipmap[i-1][4]-1;
647                         for (y = 0;y < texture->mipmap[i][3];y++)
648                         {
649                                 row0 = y*2;
650                                 row1 = y*2+1;
651                                 if (row1 >= texture->mipmap[i-1][3])
652                                         row1 = texture->mipmap[i-1][3]-1;
653                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
654                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
655                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
656                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
657                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
658                                 w = texture->mipmap[i][2];
659                                 if (layer1 > layer0)
660                                 {
661                                         if (texture->mipmap[i-1][2] > 1)
662                                         {
663                                                 // average 3D texture
664                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
665                                                 {
666                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
667                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
668                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
669                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
670                                                 }
671                                         }
672                                         else
673                                         {
674                                                 // average 3D mipmap with parent width == 1
675                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
676                                                 {
677                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
678                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
679                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
680                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
681                                                 }
682                                         }
683                                 }
684                                 else
685                                 {
686                                         if (texture->mipmap[i-1][2] > 1)
687                                         {
688                                                 // average 2D texture (common case)
689                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
690                                                 {
691                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
692                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
693                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
694                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
695                                                 }
696                                         }
697                                         else
698                                         {
699                                                 // 2D texture with parent width == 1
700                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
701                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
702                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
703                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
704                                         }
705                                 }
706                         }
707                 }
708         }
709 }
710 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
711 {
712         DPSOFTRAST_Texture *texture;
713         unsigned char *dst;
714         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
715         if (texture->binds)
716                 DPSOFTRAST_Flush();
717         if (pixels)
718         {
719                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
720                 while (blockheight > 0)
721                 {
722                         memcpy(dst, pixels, blockwidth * 4);
723                         pixels += blockwidth * 4;
724                         dst += texture->mipmap[0][2] * 4;
725                         blockheight--;
726                 }
727         }
728         DPSOFTRAST_Texture_CalculateMipmaps(index);
729 }
730 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
734         if (texture->binds)
735                 DPSOFTRAST_Flush();
736         if (pixels)
737                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
738         DPSOFTRAST_Texture_CalculateMipmaps(index);
739 }
740 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
741 {
742         DPSOFTRAST_Texture *texture;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
744         return texture->mipmap[mip][2];
745 }
746 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
747 {
748         DPSOFTRAST_Texture *texture;
749         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
750         return texture->mipmap[mip][3];
751 }
752 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
753 {
754         DPSOFTRAST_Texture *texture;
755         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
756         return texture->mipmap[mip][4];
757 }
758 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
759 {
760         DPSOFTRAST_Texture *texture;
761         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
762         if (texture->binds)
763                 DPSOFTRAST_Flush();
764         return texture->bytes + texture->mipmap[mip][0];
765 }
766 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
767 {
768         DPSOFTRAST_Texture *texture;
769         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
770         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
771         {
772                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
773                 return;
774         }
775         if (texture->binds)
776                 DPSOFTRAST_Flush();
777         texture->filter = filter;
778 }
779
780 static void DPSOFTRAST_Draw_FlushThreads(void);
781
782 static void DPSOFTRAST_Draw_SyncCommands(void)
783 {
784         if(dpsoftrast.usethreads) MEMORY_BARRIER;
785         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
786 }
787
788 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
789 {
790         DPSOFTRAST_State_Thread *thread;
791         int i;
792         int freecommand = dpsoftrast.commandpool.freecommand;
793         int usedcommands = dpsoftrast.commandpool.usedcommands;
794         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
795                 return;
796         DPSOFTRAST_Draw_SyncCommands();
797         for(;;)
798         {
799                 int waitindex = -1;
800                 int commandoffset;
801                 usedcommands = 0;
802                 for (i = 0; i < dpsoftrast.numthreads; i++)
803                 {
804                         thread = &dpsoftrast.threads[i]; 
805                         commandoffset = freecommand - thread->commandoffset;
806                         if (commandoffset < 0)
807                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
808                         if (commandoffset > usedcommands)
809                         {
810                                 waitindex = i;
811                                 usedcommands = commandoffset;
812                         }
813                 }
814                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
815                         break;
816                 thread = &dpsoftrast.threads[waitindex];
817                 Thread_LockMutex(thread->drawmutex);
818                 if (thread->commandoffset != dpsoftrast.drawcommand)
819                 {
820                         thread->waiting = true;
821                         if (thread->starving) Thread_CondSignal(thread->drawcond);
822                         Thread_CondWait(thread->waitcond, thread->drawmutex);
823                         thread->waiting = false;
824                 }
825                 Thread_UnlockMutex(thread->drawmutex);
826         }
827         dpsoftrast.commandpool.usedcommands = usedcommands;
828 }
829
// Rounds size up to the next multiple of COMMAND_SIZE; the masking arithmetic
// only yields a multiple when COMMAND_SIZE is a power of two.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> through DPSOFTRAST_AllocateCommand, using the
// aligned size of the command struct, and casts the result to that type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
834
835 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
836 {
837         DPSOFTRAST_Command *command;
838         int freecommand = dpsoftrast.commandpool.freecommand;
839         int usedcommands = dpsoftrast.commandpool.usedcommands;
840         int extra = sizeof(DPSOFTRAST_Command);
841         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
842                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
843         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
844         {
845                 if (dpsoftrast.usethreads)
846                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
847                 else
848                         DPSOFTRAST_Draw_FlushThreads();
849                 freecommand = dpsoftrast.commandpool.freecommand;
850                 usedcommands = dpsoftrast.commandpool.usedcommands;
851         }
852         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853         {
854                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855                 command->opcode = DPSOFTRAST_OPCODE_Reset;
856                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
857                 freecommand = 0;
858         }
859         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
860         command->opcode = opcode;
861         command->commandsize = size;
862         freecommand += size;
863         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
864                 freecommand = 0;
865         dpsoftrast.commandpool.freecommand = freecommand;
866         dpsoftrast.commandpool.usedcommands = usedcommands + size;
867         return command;
868 }
869
870 static void DPSOFTRAST_UndoCommand(int size)
871 {
872         int freecommand = dpsoftrast.commandpool.freecommand;
873         int usedcommands = dpsoftrast.commandpool.usedcommands;
874         freecommand -= size;
875         if (freecommand < 0)
876                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
877         usedcommands -= size;
878         dpsoftrast.commandpool.freecommand = freecommand;
879         dpsoftrast.commandpool.usedcommands = usedcommands;
880 }
881                 
882 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
883 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
884 {
885         thread->viewport[0] = command->x;
886         thread->viewport[1] = command->y;
887         thread->viewport[2] = command->width;
888         thread->viewport[3] = command->height;
889         thread->validate |= DPSOFTRAST_VALIDATE_FB;
890 }
891 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
892 {
893         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
894         command->x = x;
895         command->y = y;
896         command->width = width;
897         command->height = height;
898
899         dpsoftrast.viewport[0] = x;
900         dpsoftrast.viewport[1] = y;
901         dpsoftrast.viewport[2] = width;
902         dpsoftrast.viewport[3] = height;
903         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
904 }
905
906 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
907 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
908 {
909         int i, x1, y1, x2, y2, w, h, x, y;
910         int miny1, maxy1, miny2, maxy2;
911         int bandy;
912         unsigned int *p;
913         unsigned int c;
914         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
915         miny1 = thread->miny1;
916         maxy1 = thread->maxy1;
917         miny2 = thread->miny2;
918         maxy2 = thread->maxy2;
919         x1 = thread->fb_scissor[0];
920         y1 = thread->fb_scissor[1];
921         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
922         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
923         if (y1 < miny1) y1 = miny1;
924         if (y2 > maxy2) y2 = maxy2;
925         w = x2 - x1;
926         h = y2 - y1;
927         if (w < 1 || h < 1)
928                 return;
929         // FIXME: honor fb_colormask?
930         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
931         for (i = 0;i < 4;i++)
932         {
933                 if (!dpsoftrast.fb_colorpixels[i])
934                         continue;
935                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
936                 for (;y < bandy;y++)
937                 {
938                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
939                         for (x = x1;x < x2;x++)
940                                 p[x] = c;
941                 }
942         }
943 }
944 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
945 {
946         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
947         command->r = r;
948         command->g = g;
949         command->b = b;
950         command->a = a;
951 }
952
953 DEFCOMMAND(3, ClearDepth, float depth;)
954 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
955 {
956         int x1, y1, x2, y2, w, h, x, y;
957         int miny1, maxy1, miny2, maxy2;
958         int bandy;
959         unsigned int *p;
960         unsigned int c;
961         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
962         miny1 = thread->miny1;
963         maxy1 = thread->maxy1;
964         miny2 = thread->miny2;
965         maxy2 = thread->maxy2;
966         x1 = thread->fb_scissor[0];
967         y1 = thread->fb_scissor[1];
968         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
969         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
970         if (y1 < miny1) y1 = miny1;
971         if (y2 > maxy2) y2 = maxy2;
972         w = x2 - x1;
973         h = y2 - y1;
974         if (w < 1 || h < 1)
975                 return;
976         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
977         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
978         for (;y < bandy;y++)
979         {
980                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
981                 for (x = x1;x < x2;x++)
982                         p[x] = c;
983         }
984 }
985 void DPSOFTRAST_ClearDepth(float d)
986 {
987         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
988         command->depth = d;
989 }
990
991 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
992 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
993 {
994         thread->colormask[0] = command->r != 0;
995         thread->colormask[1] = command->g != 0;
996         thread->colormask[2] = command->b != 0;
997         thread->colormask[3] = command->a != 0;
998         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
999 }
1000 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1001 {
1002         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1003         command->r = r;
1004         command->g = g;
1005         command->b = b;
1006         command->a = a;
1007 }
1008
1009 DEFCOMMAND(5, DepthTest, int enable;)
1010 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1011 {
1012         thread->depthtest = command->enable;
1013         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1014 }
1015 void DPSOFTRAST_DepthTest(int enable)
1016 {
1017         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1018         command->enable = enable;
1019 }
1020
1021 DEFCOMMAND(6, ScissorTest, int enable;)
1022 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1023 {
1024         thread->scissortest = command->enable;
1025         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 }
1027 void DPSOFTRAST_ScissorTest(int enable)
1028 {
1029         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1030         command->enable = enable;
1031 }
1032
1033 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1034 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1035 {
1036         thread->scissor[0] = command->x;
1037         thread->scissor[1] = command->y;
1038         thread->scissor[2] = command->width;
1039         thread->scissor[3] = command->height;
1040         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1041 }
1042 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1043 {
1044         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1045         command->x = x;
1046         command->y = y;
1047         command->width = width;
1048         command->height = height;
1049 }
1050
1051 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1052 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1053 {
1054         thread->blendfunc[0] = command->sfactor;
1055         thread->blendfunc[1] = command->dfactor;
1056         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1057 }
1058 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1059 {
1060         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1061         command->sfactor = sfactor;
1062         command->dfactor = dfactor;
1063 }
1064
1065 DEFCOMMAND(9, BlendSubtract, int enable;)
1066 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1067 {
1068         thread->blendsubtract = command->enable;
1069         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1070 }
1071 void DPSOFTRAST_BlendSubtract(int enable)
1072 {
1073         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1074         command->enable = enable;
1075 }
1076
1077 DEFCOMMAND(10, DepthMask, int enable;)
1078 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1079 {
1080         thread->depthmask = command->enable;
1081 }
1082 void DPSOFTRAST_DepthMask(int enable)
1083 {
1084         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1085         command->enable = enable;
1086 }
1087
1088 DEFCOMMAND(11, DepthFunc, int func;)
1089 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1090 {
1091         thread->depthfunc = command->func;
1092 }
1093 void DPSOFTRAST_DepthFunc(int func)
1094 {
1095         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1096         command->func = func;
1097 }
1098
1099 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1100 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1101 {
1102         thread->depthrange[0] = command->nearval;
1103         thread->depthrange[1] = command->farval;
1104 }
1105 void DPSOFTRAST_DepthRange(float nearval, float farval)
1106 {
1107         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1108         command->nearval = nearval;
1109         command->farval = farval;
1110 }
1111
1112 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1113 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1114 {
1115         thread->polygonoffset[0] = command->alongnormal;
1116         thread->polygonoffset[1] = command->intoview;
1117 }
1118 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1119 {
1120         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1121         command->alongnormal = alongnormal;
1122         command->intoview = intoview;
1123 }
1124
1125 DEFCOMMAND(14, CullFace, int mode;)
1126 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1127 {
1128         thread->cullface = command->mode;
1129 }
1130 void DPSOFTRAST_CullFace(int mode)
1131 {
1132         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1133         command->mode = mode;
1134 }
1135
1136 DEFCOMMAND(15, AlphaTest, int enable;)
1137 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1138 {
1139         thread->alphatest = command->enable;
1140 }
1141 void DPSOFTRAST_AlphaTest(int enable)
1142 {
1143         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1144         command->enable = enable;
1145 }
1146
1147 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1148 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1149 {
1150         thread->alphafunc = command->func;
1151         thread->alphavalue = command->ref;
1152 }
1153 void DPSOFTRAST_AlphaFunc(int func, float ref)
1154 {
1155         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1156         command->func = func;
1157         command->ref = ref;
1158 }
1159
1160 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1161 {
1162         dpsoftrast.color[0] = r;
1163         dpsoftrast.color[1] = g;
1164         dpsoftrast.color[2] = b;
1165         dpsoftrast.color[3] = a;
1166 }
1167
1168 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1169 {
1170         int outstride = blockwidth * 4;
1171         int instride = dpsoftrast.fb_width * 4;
1172         int bx1 = blockx;
1173         int by1 = blocky;
1174         int bx2 = blockx + blockwidth;
1175         int by2 = blocky + blockheight;
1176         int bw;
1177         int x;
1178         int y;
1179         unsigned char *inpixels;
1180         unsigned char *b;
1181         unsigned char *o;
1182         DPSOFTRAST_Flush();
1183         if (bx1 < 0) bx1 = 0;
1184         if (by1 < 0) by1 = 0;
1185         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1186         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1187         bw = bx2 - bx1;
1188         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1189         if (dpsoftrast.bigendian)
1190         {
1191                 for (y = by1;y < by2;y++)
1192                 {
1193                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1194                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1195                         for (x = bx1;x < bx2;x++)
1196                         {
1197                                 o[0] = b[3];
1198                                 o[1] = b[2];
1199                                 o[2] = b[1];
1200                                 o[3] = b[0];
1201                                 o += 4;
1202                                 b += 4;
1203                         }
1204                 }
1205         }
1206         else
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         memcpy(o, b, bw*4);
1213                 }
1214         }
1215
1216 }
1217 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1218 {
1219         int tx1 = tx;
1220         int ty1 = ty;
1221         int tx2 = tx + width;
1222         int ty2 = ty + height;
1223         int sx1 = sx;
1224         int sy1 = sy;
1225         int sx2 = sx + width;
1226         int sy2 = sy + height;
1227         int swidth;
1228         int sheight;
1229         int twidth;
1230         int theight;
1231         int sw;
1232         int sh;
1233         int tw;
1234         int th;
1235         int y;
1236         unsigned int *spixels;
1237         unsigned int *tpixels;
1238         DPSOFTRAST_Texture *texture;
1239         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1240         if (mip < 0 || mip >= texture->mipmaps) return;
1241         DPSOFTRAST_Flush();
1242         spixels = dpsoftrast.fb_colorpixels[0];
1243         swidth = dpsoftrast.fb_width;
1244         sheight = dpsoftrast.fb_height;
1245         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1246         twidth = texture->mipmap[mip][2];
1247         theight = texture->mipmap[mip][3];
1248         if (tx1 < 0) tx1 = 0;
1249         if (ty1 < 0) ty1 = 0;
1250         if (tx2 > twidth) tx2 = twidth;
1251         if (ty2 > theight) ty2 = theight;
1252         if (sx1 < 0) sx1 = 0;
1253         if (sy1 < 0) sy1 = 0;
1254         if (sx2 > swidth) sx2 = swidth;
1255         if (sy2 > sheight) sy2 = sheight;
1256         tw = tx2 - tx1;
1257         th = ty2 - ty1;
1258         sw = sx2 - sx1;
1259         sh = sy2 - sy1;
1260         if (tw > sw) tw = sw;
1261         if (th > sh) th = sh;
1262         if (tw < 1 || th < 1)
1263                 return;
1264         sy1 = sheight - 1 - sy1;
1265         for (y = 0;y < th;y++)
1266                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1267         if (texture->mipmaps > 1)
1268                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1269 }
1270
1271 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1272 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1273 {
1274         if (thread->texbound[command->unitnum])
1275                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1276         thread->texbound[command->unitnum] = command->texture;
1277 }
1278 void DPSOFTRAST_SetTexture(int unitnum, int index)
1279 {
1280         DPSOFTRAST_Command_SetTexture *command;
1281         DPSOFTRAST_Texture *texture;
1282         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1283         {
1284                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1285                 return;
1286         }
1287         texture = DPSOFTRAST_Texture_GetByIndex(index);
1288         if (index && !texture)
1289         {
1290                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1291                 return;
1292         }
1293
1294         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1295         command->unitnum = unitnum;
1296         command->texture = texture;
1297
1298         dpsoftrast.texbound[unitnum] = texture;
1299         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1300 }
1301
1302 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1303 {
1304         dpsoftrast.pointer_vertex3f = vertex3f;
1305         dpsoftrast.stride_vertex = stride;
1306 }
1307 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1308 {
1309         dpsoftrast.pointer_color4f = color4f;
1310         dpsoftrast.pointer_color4ub = NULL;
1311         dpsoftrast.stride_color = stride;
1312 }
1313 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1314 {
1315         dpsoftrast.pointer_color4f = NULL;
1316         dpsoftrast.pointer_color4ub = color4ub;
1317         dpsoftrast.stride_color = stride;
1318 }
1319 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1320 {
1321         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1322         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1323         dpsoftrast.stride_texcoord[unitnum] = stride;
1324 }
1325
1326 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1327 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1328 {
1329         thread->shader_mode = command->mode;
1330         thread->shader_permutation = command->permutation;
1331         thread->shader_exactspecularmath = command->exactspecularmath;
1332 }
1333 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1334 {
1335         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1336         command->mode = mode;
1337         command->permutation = permutation;
1338         command->exactspecularmath = exactspecularmath;
1339
1340         dpsoftrast.shader_mode = mode;
1341         dpsoftrast.shader_permutation = permutation;
1342         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1343 }
1344
1345 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1346 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1347 {
1348         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 }
1350 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1351 {
1352         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1353         command->index = index;
1354         command->val[0] = v0;
1355         command->val[1] = v1;
1356         command->val[2] = v2;
1357         command->val[3] = v3;
1358
1359         dpsoftrast.uniform4f[index*4+0] = v0;
1360         dpsoftrast.uniform4f[index*4+1] = v1;
1361         dpsoftrast.uniform4f[index*4+2] = v2;
1362         dpsoftrast.uniform4f[index*4+3] = v3;
1363 }
1364 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1365 {
1366         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1367         command->index = index;
1368         memcpy(command->val, v, sizeof(command->val));
1369
1370         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1371 }
1372
1373 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1374 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1375 {
1376         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1377 }
1378 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1379 {
1380 #ifdef SSE_POSSIBLE
1381         int i, index;
1382         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1383         {
1384                 __m128 m0, m1, m2, m3;
1385                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1386                 command->index = (DPSOFTRAST_UNIFORM)index;
1387                 if (((size_t)v)&(ALIGN_SIZE-1))
1388                 {
1389                         m0 = _mm_loadu_ps(v);
1390                         m1 = _mm_loadu_ps(v+4);
1391                         m2 = _mm_loadu_ps(v+8);
1392                         m3 = _mm_loadu_ps(v+12);
1393                 }
1394                 else
1395                 {
1396                         m0 = _mm_load_ps(v);
1397                         m1 = _mm_load_ps(v+4);
1398                         m2 = _mm_load_ps(v+8);
1399                         m3 = _mm_load_ps(v+12);
1400                 }
1401                 if (transpose)
1402                 {
1403                         __m128 t0, t1, t2, t3;
1404                         t0 = _mm_unpacklo_ps(m0, m1);
1405                         t1 = _mm_unpacklo_ps(m2, m3);
1406                         t2 = _mm_unpackhi_ps(m0, m1);
1407                         t3 = _mm_unpackhi_ps(m2, m3);
1408                         m0 = _mm_movelh_ps(t0, t1);
1409                         m1 = _mm_movehl_ps(t1, t0);
1410                         m2 = _mm_movelh_ps(t2, t3);
1411                         m3 = _mm_movehl_ps(t3, t2);                     
1412                 }
1413                 _mm_store_ps(command->val, m0);
1414                 _mm_store_ps(command->val+4, m1);
1415                 _mm_store_ps(command->val+8, m2);
1416                 _mm_store_ps(command->val+12, m3);
1417                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1418                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1419                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1420                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1421         }
1422 #endif
1423 }
1424
1425 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1426 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1427 {
1428         thread->uniform1i[command->index] = command->val;
1429 }
1430 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1431 {
1432         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1433         command->index = index;
1434         command->val = i0;
1435
1436         dpsoftrast.uniform1i[command->index] = i0;
1437 }
1438
1439 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1440 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1441 {
1442         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1443 }
1444 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1445 {
1446         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1447         x /= dpsoftrast.fb_viewportscale[1];
1448         y /= dpsoftrast.fb_viewportscale[2];
1449         z /= dpsoftrast.fb_viewportscale[3];
1450         w /= dpsoftrast.fb_viewportscale[0];
1451         w -= dpsoftrast.fb_viewportcenter[1]*x + dpsoftrast.fb_viewportcenter[2]*y + dpsoftrast.fb_viewportcenter[3]*z + dpsoftrast.fb_viewportcenter[0]*w; 
1452         command->clipplane[0] = x;
1453         command->clipplane[1] = y;
1454         command->clipplane[2] = z;
1455         command->clipplane[3] = w;
1456 }
1457
1458 #ifdef SSE_POSSIBLE
// copies `size` elements of four floats each into the contiguous array dst
// src: address of the first element, consecutive elements are `stride` bytes apart
// dst: written with aligned 16-byte stores
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	// aligned loads need both the start address and the step to be multiples of ALIGN_SIZE
	if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
	{
		for (; dst < stop; dst += 4, src += stride)
			_mm_store_ps(dst, _mm_loadu_ps((const float *)src));
		return;
	}
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_load_ps((const float *)src));
}
1481
// expands `size` elements of three floats each into (x, y, z, 1) quads in dst
// src: address of the first element, consecutive elements are `stride` bytes apart
// dst: written with aligned 16-byte stores
// the final element is read with an 8-byte and a 4-byte load so that nothing
// past its third float is touched; a 16-byte load there reaches beyond the
// source array whenever the element is the last thing stored in it
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	if (stride == sizeof(float[3]))
	{
		// tightly packed: four elements are exactly three 16-byte vectors
		//   v1 = (x0, y0, z0, x1)  v2 = (y1, z1, x2, y2)  v3 = (z2, x3, y3, z3)
		// each element is moved into lanes 1..3, lane 0 is set to 1.0 and the
		// vector is rotated by one lane to give (x, y, z, 1)
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				__m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
	}
	if (dst < end)
	{
		// every element but the last is fetched with one 16-byte load whose
		// fourth float is discarded; for a positive stride of at least 12 bytes
		// that extra float still lies in front of the end of the following element
		float *last = end - 4;
		__m128 xy, z1;
		if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
		{
			while (dst < last)
			{
				__m128 v = _mm_loadu_ps((const float *)src);
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
				v = _mm_move_ss(v, _mm_set_ss(1.0f));
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
				_mm_store_ps(dst, v);
				dst += 4;
				src += stride;
			}
		}
		else
		{
			while (dst < last)
			{
				__m128 v = _mm_load_ps((const float *)src);
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
				v = _mm_move_ss(v, _mm_set_ss(1.0f));
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
				_mm_store_ps(dst, v);
				dst += 4;
				src += stride;
			}
		}
		// last element: read exactly 12 bytes
		xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)src); // (x, y, 0, 0)
		z1 = _mm_unpacklo_ps(_mm_load_ss((const float *)src + 2), _mm_set_ss(1.0f)); // (z, 1, 0, 0)
		_mm_store_ps(dst, _mm_movelh_ps(xy, z1)); // (x, y, z, 1)
	}
}
1558
// expands `size` elements of two floats each into (x, y, 0, 1) quads in dst
// src: address of the first element, consecutive elements are `stride` bytes apart
// dst: written with aligned 16-byte stores
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	// supplies the constant upper half (0, 1) of every output quad
	const __m128 tail01 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	float * const stop = dst + size*4;
	if (stride == sizeof(float[2]))
	{
		// tightly packed: one 16-byte load holds two elements
		float * const pairstop = dst + (size&~1)*4;
		const int unaligned = (((size_t)src)&(ALIGN_SIZE - 1)) != 0;
		for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
		{
			const __m128 pair = unaligned ? _mm_loadu_ps((const float *)src) : _mm_load_ps((const float *)src);
			// low half of pair followed by (0, 1)
			_mm_store_ps(dst, _mm_shuffle_ps(pair, tail01, _MM_SHUFFLE(3, 2, 1, 0)));
			// high half of pair followed by (0, 1)
			_mm_store_ps(dst + 4, _mm_movehl_ps(tail01, pair));
		}
	}
	// remaining elements one at a time, reading only the 8 bytes of the element
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(tail01, (const __m64 *)src));
}
1596
// converts `size` elements of four unsigned bytes each into quads of floats,
// every byte being multiplied by 1/255
// src: address of the first element, consecutive elements are `stride` bytes apart
// dst: written with aligned 16-byte stores
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	__m128 scale = _mm_set1_ps(1.0f/255.0f);
	if (stride == sizeof(unsigned char[4]))
	{
		// tightly packed: one 16-byte load holds four elements, which are
		// widened from 8 to 16 to 32 bits by interleaving with zero
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				__m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
	}
	while (dst < end)
	{
		// src is a byte pointer with no alignment guarantee, so the four bytes
		// are fetched with memcpy instead of dereferencing a cast int pointer
		int packed;
		__m128i v;
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
		dst += 4;
		src += stride;
	}
}
1639
// writes `size` copies of the four floats at src into dst
// src may be unaligned, dst is written with aligned 16-byte stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1650 #endif
1651
// multiplies `numitems` 4-float vectors by a 4x4 matrix
// inmatrix16f: four groups of four floats; each output is
//   in.x * group0 + in.y * group1 + in.z * group2 + in.w * group3
// out4f is written with aligned 16-byte stores and may be the same array as in4f
// when SSE_POSSIBLE is not defined the function does nothing
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	float *stop;
	int unaligned;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: plain copy, nothing to do at all when transforming in place
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	stop = out4f + numitems*4;
	unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (; out4f < stop; out4f += 4, in4f += 4)
	{
		const __m128 v = unaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		// accumulated from the w term inwards
		__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2), sum);
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
		_mm_store_ps(out4f, sum);
	}
#endif
}
1699
// copies `numitems` 4-float vectors from in4f to out4f
// out4f may be the same array as in4f or overlap it: an in-place call is a
// no-op and overlapping ranges are copied correctly (memcpy leaves both cases
// undefined); a count of zero or less copies nothing
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1704
1705 #ifdef SSE_POSSIBLE
// perspective divide followed by the viewport mapping of one vertex
// in:  (x, y, z, w)
// out: (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w, center[3] + scale[3]*z/w, center[0] + scale[0]/w)
// viewportcenter and viewportscale are therefore ordered (w term, x, y, z)
// the macro declares block-local variables named p and w
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1713
// perspective divide followed by the viewport mapping of one vertex; despite
// the name all four lanes are computed
// in:  (x, y, z, w)
// out: (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w, center[3] + scale[3]*z/w, center[0] + scale[0]/w)
// viewportcenter and viewportscale are therefore ordered (w term, x, y, z)
// the macro declares block-local variables named p and w
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1721
// multiplies one vertex by a matrix given as four vectors:
//   out = in.x * m0 + in.y * m1 + in.z * m2 + in.w * m3
// `in` is evaluated once, so out and in may name the same variable
// the macro declares a block-local variable named p
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1730
1731 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1732 {
1733         int clipmask = 0xFF;
1734         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1735         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1736         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1737         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1738         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1739         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1740         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1741         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1742         #define BBFRONT(k, pos) \
1743         { \
1744                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1745                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1746                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1747                 { \
1748                         __m128 proj; \
1749                         clipmask &= ~(1<<k); \
1750                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1751                         minproj = _mm_min_ss(minproj, proj); \
1752                         maxproj = _mm_max_ss(maxproj, proj); \
1753                 } \
1754         }
1755         BBFRONT(0, minpos); 
1756         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1757         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1758         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1759         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1760         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1761         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1762         BBFRONT(7, maxpos);
1763         #define BBCLIP(k) \
1764         { \
1765                 if (clipmask&(1<<k)) \
1766                 { \
1767                         if (!(clipmask&(1<<(k^1)))) \
1768                         { \
1769                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1770                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1771                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1772                                 minproj = _mm_min_ss(minproj, proj); \
1773                                 maxproj = _mm_max_ss(maxproj, proj); \
1774                         } \
1775                         if (!(clipmask&(1<<(k^2)))) \
1776                         { \
1777                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1778                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1779                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1780                                 minproj = _mm_min_ss(minproj, proj); \
1781                                 maxproj = _mm_max_ss(maxproj, proj); \
1782                         } \
1783                         if (!(clipmask&(1<<(k^4)))) \
1784                         { \
1785                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1786                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1787                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1788                                 minproj = _mm_min_ss(minproj, proj); \
1789                                 maxproj = _mm_max_ss(maxproj, proj); \
1790                         } \
1791                 } \
1792         }
1793         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1794         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1795         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1796         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1797         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1798         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1799         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1800         *starty = _mm_cvttss_si32(maxproj);
1801         *endy = _mm_cvttss_si32(minproj)+1;
1802         return clipmask;
1803 }
1804         
1805 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1806 {
1807         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1808         float *end = out4f + numitems*4;
1809         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1810         __m128 minpos, maxpos;
1811         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1812         {
1813                 minpos = maxpos = _mm_loadu_ps(in4f);
1814                 while (out4f < end)
1815                 {
1816                         __m128 v = _mm_loadu_ps(in4f);
1817                         minpos = _mm_min_ps(minpos, v);
1818                         maxpos = _mm_max_ps(maxpos, v);
1819                         _mm_store_ps(out4f, v);
1820                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1821                         _mm_store_ps(screen4f, v);
1822                         in4f += 4;
1823                         out4f += 4;
1824                         screen4f += 4;
1825                 }
1826         }
1827         else
1828         {
1829                 minpos = maxpos = _mm_load_ps(in4f);
1830                 while (out4f < end)
1831                 {
1832                         __m128 v = _mm_load_ps(in4f);
1833                         minpos = _mm_min_ps(minpos, v);
1834                         maxpos = _mm_max_ps(maxpos, v);
1835                         _mm_store_ps(out4f, v);
1836                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1837                         _mm_store_ps(screen4f, v);
1838                         in4f += 4;
1839                         out4f += 4;
1840                         screen4f += 4;
1841                 }
1842         }
1843         if (starty && endy) 
1844         {
1845                 ALIGN(float minposf[4]);
1846                 ALIGN(float maxposf[4]);
1847                 _mm_store_ps(minposf, minpos);
1848                 _mm_store_ps(maxposf, maxpos);
1849                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1850         }
1851         return 0;
1852 }
1853
1854 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1855 {
1856         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1857         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1858         float *end;
1859         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1860                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1861         end = out4f + numitems*4;
1862         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1863         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1864         m0 = _mm_loadu_ps(inmatrix16f);
1865         m1 = _mm_loadu_ps(inmatrix16f + 4);
1866         m2 = _mm_loadu_ps(inmatrix16f + 8);
1867         m3 = _mm_loadu_ps(inmatrix16f + 12);
1868         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1869         {
1870                 minpos = maxpos = _mm_loadu_ps(in4f);
1871                 while (out4f < end)
1872                 {
1873                         __m128 v = _mm_loadu_ps(in4f);
1874                         minpos = _mm_min_ps(minpos, v);
1875                         maxpos = _mm_max_ps(maxpos, v);
1876                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1877                         _mm_store_ps(out4f, v);
1878                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1879                         _mm_store_ps(screen4f, v);
1880                         in4f += 4;
1881                         out4f += 4;
1882                         screen4f += 4;
1883                 }
1884         }
1885         else
1886         {
1887                 minpos = maxpos = _mm_load_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_load_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         if (starty && endy) 
1903         {
1904                 ALIGN(float minposf[4]);
1905                 ALIGN(float maxposf[4]);
1906                 _mm_store_ps(minposf, minpos);
1907                 _mm_store_ps(maxposf, maxpos);
1908                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1909         }
1910         return 0;
1911 }
1912 #endif
1913
// fills dpsoftrast.post_array4f[outarray] with one 4-float vector per vertex of
// the current draw (numvertices vertices beginning at firstvertex), read from
// the client array selected by inarray, and returns that output array
// positions come from pointer_vertex3f; colors from pointer_color4f, else from
// pointer_color4ub, else the constant dpsoftrast.color is replicated; any other
// inarray value is taken as DPSOFTRAST_ARRAY_TEXCOORD0 plus a texcoord unit
// the output is left untouched for a texcoord unit without a pointer or with a
// component count other than 2, 3 or 4
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		const int stride = dpsoftrast.stride_vertex;
		const unsigned char *inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		const int stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
			DPSOFTRAST_Load4fTo4f(outf, (const unsigned char *)dpsoftrast.pointer_color4f + first * stride, count, stride);
		else if (dpsoftrast.pointer_color4ub)
			DPSOFTRAST_Load4bTo4f(outf, (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride, count, stride);
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		const int stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const unsigned char *inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			const int components = dpsoftrast.components_texcoord[unit];
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1972
1973 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1974 {
1975         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1976         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1977         return data;
1978 }
1979
#if 0
// compiled out
// projects output array `outarray` into dpsoftrast.screencoord4f and records
// the clip mask and row range of the draw in dpsoftrast.drawclipped,
// drawstarty and drawendy
// inarray >= 0: the array is first filled by DPSOFTRAST_Array_Load
// inarray < 0:  the current contents of dpsoftrast.post_array4f[outarray] are used
// returns the array, or NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1992
// transforms output array `outarray` in place by inmatrix16f, writes the
// projected vertices to dpsoftrast.screencoord4f and records the clip mask and
// row range of the draw in dpsoftrast.drawclipped, drawstarty and drawendy
// inarray >= 0: the array is first filled by DPSOFTRAST_Array_Load
// inarray < 0:  the current contents of dpsoftrast.post_array4f[outarray] are used
// returns the array, or NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2003
2004 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2005 {
2006         int x;
2007         int startx = span->startx;
2008         int endx = span->endx;
2009         float wslope = triangle->w[0];
2010         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2011         float endz = 1.0f / (w + wslope * startx);
2012         if (triangle->w[0] == 0)
2013         {
2014                 // LordHavoc: fast flat polygons (HUD/menu)
2015                 for (x = startx;x < endx;x++)
2016                         zf[x] = endz;
2017                 return;
2018         }
2019         for (x = startx;x < endx;)
2020         {
2021                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2022                 float z = endz, dz;
2023                 if (nextsub >= endx) nextsub = endsub = endx-1;
2024                 endz = 1.0f / (w + wslope * nextsub);
2025                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2026                 for (; x <= endsub; x++, z += dz)
2027                         zf[x] = z;
2028         }
2029 }
2030
2031 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2032 {
2033         int x;
2034         int startx = span->startx;
2035         int endx = span->endx;
2036         int d[4];
2037         float a, b;
2038         unsigned char * RESTRICT pixelmask = span->pixelmask;
2039         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2040         if (!pixel)
2041                 return;
2042         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2043         // handle alphatest now (this affects depth writes too)
2044         if (thread->alphatest)
2045                 for (x = startx;x < endx;x++)
2046                         if (in4f[x*4+3] < 0.5f)
2047                                 pixelmask[x] = false;
2048         // FIXME: this does not handle bigendian
2049         switch(thread->fb_blendmode)
2050         {
2051         case DPSOFTRAST_BLENDMODE_OPAQUE:
2052                 for (x = startx;x < endx;x++)
2053                 {
2054                         if (!pixelmask[x])
2055                                 continue;
2056                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2057                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2058                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2059                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2060                         pixel[x*4+0] = d[0];
2061                         pixel[x*4+1] = d[1];
2062                         pixel[x*4+2] = d[2];
2063                         pixel[x*4+3] = d[3];
2064                 }
2065                 break;
2066         case DPSOFTRAST_BLENDMODE_ALPHA:
2067                 for (x = startx;x < endx;x++)
2068                 {
2069                         if (!pixelmask[x])
2070                                 continue;
2071                         a = in4f[x*4+3] * 255.0f;
2072                         b = 1.0f - in4f[x*4+3];
2073                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2074                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2075                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2076                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2077                         pixel[x*4+0] = d[0];
2078                         pixel[x*4+1] = d[1];
2079                         pixel[x*4+2] = d[2];
2080                         pixel[x*4+3] = d[3];
2081                 }
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2084                 for (x = startx;x < endx;x++)
2085                 {
2086                         if (!pixelmask[x])
2087                                 continue;
2088                         a = in4f[x*4+3] * 255.0f;
2089                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2090                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2091                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2092                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2093                         pixel[x*4+0] = d[0];
2094                         pixel[x*4+1] = d[1];
2095                         pixel[x*4+2] = d[2];
2096                         pixel[x*4+3] = d[3];
2097                 }
2098                 break;
2099         case DPSOFTRAST_BLENDMODE_ADD:
2100                 for (x = startx;x < endx;x++)
2101                 {
2102                         if (!pixelmask[x])
2103                                 continue;
2104                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2105                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2106                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2107                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2108                         pixel[x*4+0] = d[0];
2109                         pixel[x*4+1] = d[1];
2110                         pixel[x*4+2] = d[2];
2111                         pixel[x*4+3] = d[3];
2112                 }
2113                 break;
2114         case DPSOFTRAST_BLENDMODE_INVMOD:
2115                 for (x = startx;x < endx;x++)
2116                 {
2117                         if (!pixelmask[x])
2118                                 continue;
2119                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2120                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2121                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2122                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2123                         pixel[x*4+0] = d[0];
2124                         pixel[x*4+1] = d[1];
2125                         pixel[x*4+2] = d[2];
2126                         pixel[x*4+3] = d[3];
2127                 }
2128                 break;
2129         case DPSOFTRAST_BLENDMODE_MUL:
2130                 for (x = startx;x < endx;x++)
2131                 {
2132                         if (!pixelmask[x])
2133                                 continue;
2134                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2135                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2136                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2137                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2138                         pixel[x*4+0] = d[0];
2139                         pixel[x*4+1] = d[1];
2140                         pixel[x*4+2] = d[2];
2141                         pixel[x*4+3] = d[3];
2142                 }
2143                 break;
2144         case DPSOFTRAST_BLENDMODE_MUL2:
2145                 for (x = startx;x < endx;x++)
2146                 {
2147                         if (!pixelmask[x])
2148                                 continue;
2149                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2150                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2151                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2152                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2153                         pixel[x*4+0] = d[0];
2154                         pixel[x*4+1] = d[1];
2155                         pixel[x*4+2] = d[2];
2156                         pixel[x*4+3] = d[3];
2157                 }
2158                 break;
2159         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2160                 for (x = startx;x < endx;x++)
2161                 {
2162                         if (!pixelmask[x])
2163                                 continue;
2164                         a = in4f[x*4+3] * -255.0f;
2165                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2166                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2167                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2168                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2169                         pixel[x*4+0] = d[0];
2170                         pixel[x*4+1] = d[1];
2171                         pixel[x*4+2] = d[2];
2172                         pixel[x*4+3] = d[3];
2173                 }
2174                 break;
2175         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2176                 for (x = startx;x < endx;x++)
2177                 {
2178                         if (!pixelmask[x])
2179                                 continue;
2180                         a = 255.0f;
2181                         b = 1.0f - in4f[x*4+3];
2182                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2183                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2184                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2185                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2186                         pixel[x*4+0] = d[0];
2187                         pixel[x*4+1] = d[1];
2188                         pixel[x*4+2] = d[2];
2189                         pixel[x*4+3] = d[3];
2190                 }
2191                 break;
2192         case DPSOFTRAST_BLENDMODE_INVADD:
2193                 for (x = startx;x < endx;x++)
2194                 {
2195                         if (!pixelmask[x])
2196                                 continue;
2197                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2198                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2199                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2200                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2201                         pixel[x*4+0] = d[0];
2202                         pixel[x*4+1] = d[1];
2203                         pixel[x*4+2] = d[2];
2204                         pixel[x*4+3] = d[3];
2205                 }
2206                 break;
2207         }
2208 }
2209
2210 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2211 {
2212 #ifdef SSE_POSSIBLE
2213         int x;
2214         int startx = span->startx;
2215         int endx = span->endx;
2216         int subx;
2217         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2218         unsigned char * RESTRICT pixelmask = span->pixelmask;
2219         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2220         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2221         if (!pixel)
2222                 return;
2223         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2224         pixeli += span->y * dpsoftrast.fb_width + span->x;
2225         // handle alphatest now (this affects depth writes too)
2226         if (thread->alphatest)
2227                 for (x = startx;x < endx;x++)
2228                         if (in4ub[x*4+3] < 128)
2229                                 pixelmask[x] = false;
2230         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2231         // helps sprites, text and hud artwork
2232         switch(thread->fb_blendmode)
2233         {
2234         case DPSOFTRAST_BLENDMODE_ALPHA:
2235         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2237                 for (x = startx;x < endx;x++)
2238                         if (in4ub[x*4+3] < 1)
2239                                 pixelmask[x] = false;
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_OPAQUE:
2242         case DPSOFTRAST_BLENDMODE_ADD:
2243         case DPSOFTRAST_BLENDMODE_INVMOD:
2244         case DPSOFTRAST_BLENDMODE_MUL:
2245         case DPSOFTRAST_BLENDMODE_MUL2:
2246         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2247         case DPSOFTRAST_BLENDMODE_INVADD:
2248                 break;
2249         }
2250         // put some special values at the end of the mask to ensure the loops end
2251         pixelmask[endx] = 1;
2252         pixelmask[endx+1] = 0;
2253         // LordHavoc: use a double loop to identify subspans, this helps the
2254         // optimized copy/blend loops to perform at their best, most triangles
2255         // have only one run of pixels, and do the search using wide reads...
2256         x = startx;
2257         while (x < endx)
2258         {
2259                 // if this pixel is masked off, it's probably not alone...
2260                 if (!pixelmask[x])
2261                 {
2262                         x++;
2263 #if 1
2264                         if (x + 8 < endx)
2265                         {
2266                                 // the 4-item search must be aligned or else it stalls badly
2267                                 if ((x & 3) && !pixelmask[x]) x++;
2268                                 if ((x & 3) && !pixelmask[x]) x++;
2269                                 if ((x & 3) && !pixelmask[x]) x++;
2270                                 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2271                                         x += 4;
2272                         }
2273 #endif
2274                         for (;!pixelmask[x];x++)
2275                                 ;
2276                         // rather than continue the loop, just check the end variable
2277                         if (x >= endx)
2278                                 break;
2279                 }
2280                 // find length of subspan
2281                 subx = x + 1;
2282 #if 1
2283                 if (x + 8 < endx)
2284                 {
2285                         if ((subx & 3) && pixelmask[subx]) subx++;
2286                         if ((subx & 3) && pixelmask[subx]) subx++;
2287                         if ((subx & 3) && pixelmask[subx]) subx++;
2288                         while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2289                                 subx += 4;
2290                 }
2291 #endif
2292                 for (;pixelmask[subx];subx++)
2293                         ;
2294                 // the checks can overshoot, so make sure to clip it...
2295                 if (subx > endx)
2296                         subx = endx;
2297                 // now that we know the subspan length...  process!
2298                 switch(thread->fb_blendmode)
2299                 {
2300                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2301 #if 0
2302                         if (subx - x >= 16)
2303                         {
2304                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2305                                 x = subx;
2306                         }
2307                         else
2308 #elif 1
2309                         while (x + 16 <= subx)
2310                         {
2311                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2312                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2313                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2314                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2315                                 x += 16;
2316                         }
2317 #endif
2318                         {
2319                                 while (x + 4 <= subx)
2320                                 {
2321                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2322                                         x += 4;
2323                                 }
2324                                 if (x + 2 <= subx)
2325                                 {
2326                                         pixeli[x] = ini[x];
2327                                         pixeli[x+1] = ini[x+1];
2328                                         x += 2;
2329                                 }
2330                                 if (x < subx)
2331                                 {
2332                                         pixeli[x] = ini[x];
2333                                         x++;
2334                                 }
2335                         }
2336                         break;
2337                 case DPSOFTRAST_BLENDMODE_ALPHA:
2338                 #define FINISHBLEND(blend2, blend1) \
2339                         for (;x + 1 < subx;x += 2) \
2340                         { \
2341                                 __m128i src, dst; \
2342                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2343                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2344                                 blend2; \
2345                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2346                         } \
2347                         if (x < subx) \
2348                         { \
2349                                 __m128i src, dst; \
2350                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2351                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2352                                 blend1; \
2353                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2354                                 x++; \
2355                         }
2356                         FINISHBLEND({
2357                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2358                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2359                         }, {
2360                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2361                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2362                         });
2363                         break;
2364                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2365                         FINISHBLEND({
2366                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2367                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2368                         }, {
2369                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2370                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2371                         });
2372                         break;
2373                 case DPSOFTRAST_BLENDMODE_ADD:
2374                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2375                         break;
2376                 case DPSOFTRAST_BLENDMODE_INVMOD:
2377                         FINISHBLEND({
2378                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2379                         }, {
2380                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2381                         });
2382                         break;
2383                 case DPSOFTRAST_BLENDMODE_MUL:
2384                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2385                         break;
2386                 case DPSOFTRAST_BLENDMODE_MUL2:
2387                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2388                         break;
2389                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2390                         FINISHBLEND({
2391                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2392                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2393                         }, {
2394                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2395                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2396                         });
2397                         break;
2398                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2399                         FINISHBLEND({
2400                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2401                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2402                         }, {
2403                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2404                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2405                         });
2406                         break;
2407                 case DPSOFTRAST_BLENDMODE_INVADD:
2408                         FINISHBLEND({
2409                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2410                         }, {
2411                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2412                         });
2413                         break;
2414                 }
2415         }
2416 #endif
2417 }
2418
2419 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2420 {
2421         int x;
2422         int startx = span->startx;
2423         int endx = span->endx;
2424         int flags;
2425         float c[4];
2426         float data[4];
2427         float slope[4];
2428         float tc[2], endtc[2];
2429         float tcscale[2];
2430         unsigned int tci[2];
2431         unsigned int tci1[2];
2432         unsigned int tcimin[2];
2433         unsigned int tcimax[2];
2434         int tciwrapmask[2];
2435         int tciwidth;
2436         int filter;
2437         int mip;
2438         const unsigned char * RESTRICT pixelbase;
2439         const unsigned char * RESTRICT pixel[4];
2440         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2441         // if no texture is bound, just fill it with white
2442         if (!texture)
2443         {
2444                 for (x = startx;x < endx;x++)
2445                 {
2446                         out4f[x*4+0] = 1.0f;
2447                         out4f[x*4+1] = 1.0f;
2448                         out4f[x*4+2] = 1.0f;
2449                         out4f[x*4+3] = 1.0f;
2450                 }
2451                 return;
2452         }
2453         mip = triangle->mip[texunitindex];
2454         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2455         // if this mipmap of the texture is 1 pixel, just fill it with that color
2456         if (texture->mipmap[mip][1] == 4)
2457         {
2458                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2459                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2460                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2461                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2462                 for (x = startx;x < endx;x++)
2463                 {
2464                         out4f[x*4+0] = c[0];
2465                         out4f[x*4+1] = c[1];
2466                         out4f[x*4+2] = c[2];
2467                         out4f[x*4+3] = c[3];
2468                 }
2469                 return;
2470         }
2471         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2473         flags = texture->flags;
2474         tcscale[0] = texture->mipmap[mip][2];
2475         tcscale[1] = texture->mipmap[mip][3];
2476         tciwidth = texture->mipmap[mip][2];
2477         tcimin[0] = 0;
2478         tcimin[1] = 0;
2479         tcimax[0] = texture->mipmap[mip][2]-1;
2480         tcimax[1] = texture->mipmap[mip][3]-1;
2481         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2482         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2483         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2484         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2485         if (filter)
2486         {
2487                 endtc[0] -= 0.5f;
2488                 endtc[1] -= 0.5f;
2489         }
2490         for (x = startx;x < endx;)
2491         {
2492                 unsigned int subtc[2];
2493                 unsigned int substep[2];
2494                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2495                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2496                 if (nextsub >= endx)
2497                 {
2498                         nextsub = endsub = endx-1;      
2499                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2500                 }
2501                 tc[0] = endtc[0];
2502                 tc[1] = endtc[1];
2503                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2504                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2505                 if (filter)
2506                 {
2507                         endtc[0] -= 0.5f;
2508                         endtc[1] -= 0.5f;
2509                 }
2510                 substep[0] = (endtc[0] - tc[0]) * subscale;
2511                 substep[1] = (endtc[1] - tc[1]) * subscale;
2512                 subtc[0] = tc[0] * (1<<12);
2513                 subtc[1] = tc[1] * (1<<12);
2514                 if (filter)
2515                 {
2516                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2517                         {
2518                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2519                                 {
2520                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2521                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2522                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2523                                         tci[0] = subtc[0]>>12;
2524                                         tci[1] = subtc[1]>>12;
2525                                         tci1[0] = tci[0] + 1;
2526                                         tci1[1] = tci[1] + 1;
2527                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2528                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2529                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2530                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2531                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2532                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2533                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2534                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2535                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2536                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2537                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2538                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2539                                         out4f[x*4+0] = c[0];
2540                                         out4f[x*4+1] = c[1];
2541                                         out4f[x*4+2] = c[2];
2542                                         out4f[x*4+3] = c[3];
2543                                 }
2544                         }
2545                         else
2546                         {
2547                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2548                                 {
2549                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2550                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2551                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2552                                         tci[0] = subtc[0]>>12;
2553                                         tci[1] = subtc[1]>>12;
2554                                         tci1[0] = tci[0] + 1;
2555                                         tci1[1] = tci[1] + 1;
2556                                         tci[0] &= tciwrapmask[0];
2557                                         tci[1] &= tciwrapmask[1];
2558                                         tci1[0] &= tciwrapmask[0];
2559                                         tci1[1] &= tciwrapmask[1];
2560                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2561                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2562                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2563                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2564                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2565                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2566                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2567                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2568                                         out4f[x*4+0] = c[0];
2569                                         out4f[x*4+1] = c[1];
2570                                         out4f[x*4+2] = c[2];
2571                                         out4f[x*4+3] = c[3];
2572                                 }
2573                         }
2574                 }
2575                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2576                 {
2577                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2578                         {
2579                                 tci[0] = subtc[0]>>12;
2580                                 tci[1] = subtc[1]>>12;
2581                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2582                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2583                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2584                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2585                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2586                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2587                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2588                                 out4f[x*4+0] = c[0];
2589                                 out4f[x*4+1] = c[1];
2590                                 out4f[x*4+2] = c[2];
2591                                 out4f[x*4+3] = c[3];
2592                         }
2593                 }
2594                 else
2595                 {
2596                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2597                         {
2598                                 tci[0] = subtc[0]>>12;
2599                                 tci[1] = subtc[1]>>12;
2600                                 tci[0] &= tciwrapmask[0];
2601                                 tci[1] &= tciwrapmask[1];
2602                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2603                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2604                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2605                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2606                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2607                                 out4f[x*4+0] = c[0];
2608                                 out4f[x*4+1] = c[1];
2609                                 out4f[x*4+2] = c[2];
2610                                 out4f[x*4+3] = c[3];
2611                         }
2612                 }
2613         }
2614 }
2615
2616 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2617 {
2618 #ifdef SSE_POSSIBLE
2619         int x;
2620         int startx = span->startx;
2621         int endx = span->endx;
2622         int flags;
2623         __m128 data, slope, tcscale;
2624         __m128i tcsize, tcmask, tcoffset, tcmax;
2625         __m128 tc, endtc;
2626         __m128i subtc, substep, endsubtc;
2627         int filter;
2628         int mip;
2629         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2630         const unsigned char * RESTRICT pixelbase;
2631         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2632         // if no texture is bound, just fill it with white
2633         if (!texture)
2634         {
2635                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2636                 return;
2637         }
2638         mip = triangle->mip[texunitindex];
2639         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2640         // if this mipmap of the texture is 1 pixel, just fill it with that color
2641         if (texture->mipmap[mip][1] == 4)
2642         {
2643                 unsigned int k = *((const unsigned int *)pixelbase);
2644                 for (x = startx;x < endx;x++)
2645                         outi[x] = k;
2646                 return;
2647         }
2648         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2649         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2650         flags = texture->flags;
2651         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2652         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2653         tcscale = _mm_cvtepi32_ps(tcsize);
2654         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2655         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2656         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2657         if (filter)
2658                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2659         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2660         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2661         tcmax = _mm_packs_epi32(tcmask, tcmask);
2662         for (x = startx;x < endx;)
2663         {
2664                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2665                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2666                 if (nextsub >= endx)
2667                 {
2668                         nextsub = endsub = endx-1;
2669                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2670                 }       
2671                 tc = endtc;
2672                 subtc = endsubtc;
2673                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2674                 if (filter)
2675                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2676                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2677                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2678                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2679                 substep = _mm_slli_epi32(substep, 1);
2680                 if (filter)
2681                 {
2682                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2683                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2684                         {
2685                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2686                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2687                                 {
2688                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2689                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2690                                         tci = _mm_madd_epi16(tci, tcoffset);
2691                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2692                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2693                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2694                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2695                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2696                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2697                                         fracm = _mm_srli_epi16(subtc, 1);
2698                                         pix1 = _mm_add_epi16(pix1,
2699                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2700                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2701                                         pix3 = _mm_add_epi16(pix3,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2704                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2705                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2706                                         pix2 = _mm_add_epi16(pix2,
2707                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2708                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2709                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2710                                 }
2711                                 if (x <= endsub)
2712                                 {
2713                                         const unsigned char * RESTRICT ptr1;
2714                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2715                                         tci = _mm_madd_epi16(tci, tcoffset);
2716                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2717                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2718                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2719                                         fracm = _mm_srli_epi16(subtc, 1);
2720                                         pix1 = _mm_add_epi16(pix1,
2721                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2722                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2723                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2724                                         pix1 = _mm_add_epi16(pix1,
2725                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2726                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2727                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2728                                         x++;
2729                                 }
2730                         }
2731                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2732                         {
2733                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2734                                 {
2735                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2736                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2737                                         tci = _mm_madd_epi16(tci, tcoffset);
2738                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740                                                                                         _mm_setzero_si128());
2741                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743                                                                                         _mm_setzero_si128());
2744                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2745                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2746                                         tci = _mm_madd_epi16(tci, tcoffset);
2747                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2748                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2749                                                                                         _mm_setzero_si128());
2750                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2751                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2752                                                                                         _mm_setzero_si128());
2753                                         fracm = _mm_srli_epi16(subtc, 1);
2754                                         pix1 = _mm_add_epi16(pix1,
2755                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2756                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2757                                         pix3 = _mm_add_epi16(pix3,
2758                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2759                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2760                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2761                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2762                                         pix2 = _mm_add_epi16(pix2,
2763                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2764                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2765                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2766                                 }
2767                                 if (x <= endsub)
2768                                 {
2769                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2770                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2771                                         tci = _mm_madd_epi16(tci, tcoffset);
2772                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2773                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2774                                                                                         _mm_setzero_si128());
2775                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2776                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2777                                                                                         _mm_setzero_si128());
2778                                         fracm = _mm_srli_epi16(subtc, 1);
2779                                         pix1 = _mm_add_epi16(pix1,
2780                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2781                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2782                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2783                                         pix1 = _mm_add_epi16(pix1,
2784                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2785                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2786                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2787                                         x++;
2788                                 }
2789                         }
2790                         else
2791                         {
2792                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2793                                 {
2794                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2795                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2796                                         tci = _mm_madd_epi16(tci, tcoffset);
2797                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2798                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2799                                                                                         _mm_setzero_si128());
2800                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2801                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2802                                                                                         _mm_setzero_si128());
2803                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2804                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2805                                         tci = _mm_madd_epi16(tci, tcoffset);
2806                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2807                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2808                                                                                         _mm_setzero_si128());
2809                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2810                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2811                                                                                         _mm_setzero_si128());
2812                                         fracm = _mm_srli_epi16(subtc, 1);
2813                                         pix1 = _mm_add_epi16(pix1,
2814                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2815                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2816                                         pix3 = _mm_add_epi16(pix3,
2817                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2818                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2819                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2820                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2821                                         pix2 = _mm_add_epi16(pix2,
2822                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2823                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2824                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2825                                 }
2826                                 if (x <= endsub)
2827                                 {
2828                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2829                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2830                                         tci = _mm_madd_epi16(tci, tcoffset);
2831                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2832                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2833                                                                                         _mm_setzero_si128());
2834                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2835                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2836                                                                                         _mm_setzero_si128());
2837                                         fracm = _mm_srli_epi16(subtc, 1);
2838                                         pix1 = _mm_add_epi16(pix1,
2839                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2840                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2841                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2842                                         pix1 = _mm_add_epi16(pix1,
2843                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2844                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2845                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2846                                         x++;
2847                                 }
2848                         }
2849                 }
2850                 else
2851                 {
2852                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2853                         {
2854                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2855                                 {
2856                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2857                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2858                                         tci = _mm_madd_epi16(tci, tcoffset);
2859                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2860                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2861                                 }
2862                                 if (x <= endsub)
2863                                 {
2864                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2865                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2866                                         tci = _mm_madd_epi16(tci, tcoffset);
2867                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2868                                         x++;
2869                                 }
2870                         }
2871                         else
2872                         {
2873                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2874                                 {
2875                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2876                                         tci = _mm_and_si128(tci, tcmax); 
2877                                         tci = _mm_madd_epi16(tci, tcoffset);
2878                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2879                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2880                                 }
2881                                 if (x <= endsub)
2882                                 {
2883                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2884                                         tci = _mm_and_si128(tci, tcmax); 
2885                                         tci = _mm_madd_epi16(tci, tcoffset);
2886                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2887                                         x++;
2888                                 }
2889                         }
2890                 }
2891         }
2892 #endif
2893 }
2894
2895 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2896 {
2897         // TODO: IMPLEMENT
2898         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2899 }
2900
// Shadowmap lookup placeholder: ignores vector and always returns 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2906
2907 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2908 {
2909         int x;
2910         int startx = span->startx;
2911         int endx = span->endx;
2912         float c[4];
2913         float data[4];
2914         float slope[4];
2915         float z;
2916         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2917         for (x = startx;x < endx;x++)
2918         {
2919                 z = zf[x];
2920                 c[0] = (data[0] + slope[0]*x) * z;
2921                 c[1] = (data[1] + slope[1]*x) * z;
2922                 c[2] = (data[2] + slope[2]*x) * z;
2923                 c[3] = (data[3] + slope[3]*x) * z;
2924                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2925                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2926                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2927                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2928         }
2929 }
2930
2931 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2932 {
2933         int x;
2934         int startx = span->startx;
2935         int endx = span->endx;
2936         float c[4];
2937         float data[4];
2938         float slope[4];
2939         float z;
2940         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2941         for (x = startx;x < endx;x++)
2942         {
2943                 z = zf[x];
2944                 c[0] = (data[0] + slope[0]*x) * z;
2945                 c[1] = (data[1] + slope[1]*x) * z;
2946                 c[2] = (data[2] + slope[2]*x) * z;
2947                 c[3] = (data[3] + slope[3]*x) * z;
2948                 out4f[x*4+0] = c[0];
2949                 out4f[x*4+1] = c[1];
2950                 out4f[x*4+2] = c[2];
2951                 out4f[x*4+3] = c[3];
2952         }
2953 }
2954
2955 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2956 {
2957         int x, startx = span->startx, endx = span->endx;
2958         float c[4], localcolor[4];
2959         localcolor[0] = subcolor[0];
2960         localcolor[1] = subcolor[1];
2961         localcolor[2] = subcolor[2];
2962         localcolor[3] = subcolor[3];
2963         for (x = startx;x < endx;x++)
2964         {
2965                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2966                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2967                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2968                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2969                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2970                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2971                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2972                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2973         }
2974 }
2975
2976 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2977 {
2978         int x, startx = span->startx, endx = span->endx;
2979         for (x = startx;x < endx;x++)
2980         {
2981                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2982                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2983                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2984                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2985         }
2986 }
2987
2988 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2989 {
2990         int x, startx = span->startx, endx = span->endx;
2991         for (x = startx;x < endx;x++)
2992         {
2993                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2994                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2995                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2996                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2997         }
2998 }
2999
3000 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3001 {
3002         int x, startx = span->startx, endx = span->endx;
3003         float a, b;
3004         for (x = startx;x < endx;x++)
3005         {
3006                 a = 1.0f - inb4f[x*4+3];
3007                 b = inb4f[x*4+3];
3008                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3009                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3010                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3011                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
3012         }
3013 }
3014
3015 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3016 {
3017         int x, startx = span->startx, endx = span->endx;
3018         float localcolor[4], ilerp, lerp;
3019         localcolor[0] = color[0];
3020         localcolor[1] = color[1];
3021         localcolor[2] = color[2];
3022         localcolor[3] = color[3];
3023         ilerp = 1.0f - localcolor[3];
3024         lerp = localcolor[3];
3025         for (x = startx;x < endx;x++)
3026         {
3027                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3028                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3029                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3030                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
3031         }
3032 }
3033
3034
3035
3036 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3037 {
3038 #ifdef SSE_POSSIBLE
3039         int x;
3040         int startx = span->startx;
3041         int endx = span->endx;
3042         __m128 data, slope;
3043         __m128 mod, endmod;
3044         __m128i submod, substep, endsubmod;
3045         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3046         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3047         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3048         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3049         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3050         for (x = startx; x < endx;)
3051         {
3052                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3053                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3054                 if (nextsub >= endx)
3055                 {
3056                         nextsub = endsub = endx-1;
3057                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3058                 }
3059                 mod = endmod;
3060                 submod = endsubmod;
3061                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3062                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3063                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3064                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3065                 substep = _mm_packs_epi32(substep, substep);
3066                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3067                 {
3068                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3069                         pix = _mm_mulhi_epu16(pix, submod);
3070                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3071                 }
3072                 if (x <= endsub)
3073                 {
3074                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3075                         pix = _mm_mulhi_epu16(pix, submod);
3076                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3077                         x++;
3078                 }
3079         }
3080 #endif
3081 }
3082
3083 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3084 {
3085 #ifdef SSE_POSSIBLE
3086         int x;
3087         int startx = span->startx;
3088         int endx = span->endx;
3089         __m128 data, slope;
3090         __m128 mod, endmod;
3091         __m128i submod, substep, endsubmod;
3092         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3093         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3094         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3095         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3096         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3097         for (x = startx; x < endx;)
3098         {
3099                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3100                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3101                 if (nextsub >= endx)
3102                 {
3103                         nextsub = endsub = endx-1;
3104                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3105                 }
3106                 mod = endmod;
3107                 submod = endsubmod;
3108                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3109                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3110                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3111                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3112                 substep = _mm_packs_epi32(substep, substep);
3113                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3114                 {
3115                         __m128i pix = _mm_srai_epi16(submod, 4);
3116                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3117                 }
3118                 if (x <= endsub)
3119                 {
3120                         __m128i pix = _mm_srai_epi16(submod, 4);
3121                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3122                         x++;
3123                 }
3124         }
3125 #endif
3126 }
3127
3128 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3129 {
3130 #ifdef SSE_POSSIBLE
3131         int x, startx = span->startx, endx = span->endx;
3132         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3133         localcolor = _mm_packs_epi32(localcolor, localcolor);
3134         for (x = startx;x+2 <= endx;x+=2)
3135         {
3136                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3137                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3138                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3139                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3140         }
3141         if (x < endx)
3142         {
3143                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3144                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3145                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3146                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3147         }
3148 #endif
3149 }
3150
3151 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3152 {
3153 #ifdef SSE_POSSIBLE
3154         int x, startx = span->startx, endx = span->endx;
3155         for (x = startx;x+2 <= endx;x+=2)
3156         {
3157                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3158                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3159                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3160                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3161         }
3162         if (x < endx)
3163         {
3164                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3165                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3166                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3167                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3168         }
3169 #endif
3170 }
3171
3172 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3173 {
3174 #ifdef SSE_POSSIBLE
3175         int x, startx = span->startx, endx = span->endx;
3176         for (x = startx;x+2 <= endx;x+=2)
3177         {
3178                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3179                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3180                 pix1 = _mm_add_epi16(pix1, pix2);
3181                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3182         }
3183         if (x < endx)
3184         {
3185                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3186                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3187                 pix1 = _mm_add_epi16(pix1, pix2);
3188                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3189         }
3190 #endif
3191 }
3192
3193 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3194 {
3195 #ifdef SSE_POSSIBLE
3196         int x, startx = span->startx, endx = span->endx;
3197         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3198         tint = _mm_packs_epi32(tint, tint);
3199         for (x = startx;x+2 <= endx;x+=2)
3200         {
3201                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3202                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3203                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3204                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3205         }
3206         if (x < endx)
3207         {
3208                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3209                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3210                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3211                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3212         }
3213 #endif
3214 }
3215
3216 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3217 {
3218 #ifdef SSE_POSSIBLE
3219         int x, startx = span->startx, endx = span->endx;
3220         for (x = startx;x+2 <= endx;x+=2)
3221         {
3222                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3223                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3224                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3225                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3226                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3227         }
3228         if (x < endx)
3229         {
3230                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3231                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3232                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3233                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3234                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3235         }
3236 #endif
3237 }
3238
3239 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3240 {
3241 #ifdef SSE_POSSIBLE
3242         int x, startx = span->startx, endx = span->endx;
3243         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3244         localcolor = _mm_packs_epi32(localcolor, localcolor);
3245         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3246         for (x = startx;x+2 <= endx;x+=2)
3247         {
3248                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3249                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3250                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3251         }
3252         if (x < endx)
3253         {
3254                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3255                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3256                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3257         }
3258 #endif
3259 }
3260
3261
3262
3263 void DPSOFTRAST_VertexShader_Generic(void)
3264 {
3265         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3266         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3267         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3268         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3269                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3270 }
3271
3272 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3273 {
3274         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3275         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3277         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3278         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3279         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3280         {
3281                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3282                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3283                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3284                 {
3285                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3286                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3287                         {
3288                                 // multiply
3289                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3290                         }
3291                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3292                         {
3293                                 // add
3294                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3295                         }
3296                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3297                         {
3298                                 // alphablend
3299                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3300                         }
3301                 }
3302         }
3303         else
3304                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3305         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3306 }
3307
3308
3309
3310 void DPSOFTRAST_VertexShader_PostProcess(void)
3311 {
3312         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3313         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3314         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3315 }
3316
3317 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3318 {
3319         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3320         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3321         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3322         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3323         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3324         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3325         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3326         {
3327                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3328                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3329         }
3330         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3331         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3332         {
3333                 // TODO: implement saturation
3334         }
3335         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3336         {
3337                 // TODO: implement gammaramps
3338         }
3339         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3340 }
3341
3342
3343
3344 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3345 {
3346         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3347 }
3348
3349 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3350 {
3351         // this is never called (because colormask is off when this shader is used)
3352         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3353         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3354         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3355         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3356         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3357 }
3358
3359
3360
3361 void DPSOFTRAST_VertexShader_FlatColor(void)
3362 {
3363         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3364         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3365 }
3366
// Pixel stage of the flat-color shader (SSE only; without SSE_POSSIBLE the body is empty).
//
// For every pixel of the span whose pixelmask byte is nonzero, the sampled
// GL20TU_COLOR texel is multiplied per channel by the Color_Ambient uniform
// (fixed point, scaled by 256); the 4th channel uses the Alpha uniform
// (scaled by 255) instead.
//
// Output goes straight into dpsoftrast.fb_colorpixels[0] unless alpha test is
// enabled or the blend mode is not opaque; in that case the pixels are written
// to a temporary buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3367 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3368 {
3369 #ifdef SSE_POSSIBLE
3370         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the span's row in the first color buffer
3371         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3372         int x, startx = span->startx, endx = span->endx;
3373         __m128i Color_Ambientm;
3374         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3375         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3376         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3377         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3378         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test / blending need the Finish pass, so render into the temporary buffer instead
3379         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3380                 pixel = buffer_FragColorbgra8;
             // Color_Ambient * 256 with components 0 and 2 swapped
3381         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // replace the 4th component with Alpha * 255
3382         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
             // narrow to 16 bits and duplicate into both halves (one register covers two pixels)
3384         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3385         for (x = startx;x < endx;x++)
3386         {
3387                 __m128i color, pix;
                     // fast path: four consecutive pixels whose mask bytes are all exactly 1
3388                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3389                 {
3390                         __m128i pix2;
3391                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // texel channels sit in the high byte (value << 8), so mulhi gives texel * color / 256
3392                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3393                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3394                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // 3 here plus the loop's x++ advances by 4 pixels
3395                         x += 3;
3396                         continue;
3397                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3398                 if (!pixelmask[x])
3399                         continue;
3400                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3401                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3402                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3403         }
             // only the temporary-buffer case goes through the Finish pass
3404         if (pixel == buffer_FragColorbgra8)
3405                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3406 #endif
3407 }
3408
3409
3410
3411 void DPSOFTRAST_VertexShader_VertexColor(void)
3412 {
3413         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3414         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3415         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3416 }
3417
// Pixel stage of the vertex-color shader (SSE only; without SSE_POSSIBLE the body is empty).
//
// For every pixel of the span whose pixelmask byte is nonzero:
//   out = texel * (Color_Diffuse * vertexcolor + Color_Ambient)
// computed per channel in fixed point, where vertexcolor is the interpolated
// DPSOFTRAST_ARRAY_COLOR attribute multiplied by buffer_z[x]. The 4th channel
// of the ambient term is the Alpha uniform (scaled by 255) and the 4th channel
// of the diffuse term is zeroed.
//
// Output goes straight into dpsoftrast.fb_colorpixels[0] unless alpha test is
// enabled or the blend mode is not opaque; in that case the pixels are written
// to a temporary buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3418 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3419 {
3420 #ifdef SSE_POSSIBLE
3421         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the span's row in the first color buffer
3422         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3423         int x, startx = span->startx, endx = span->endx;
3424         __m128i Color_Ambientm, Color_Diffusem;
3425         __m128 data, slope;
3426         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3427         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3428         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3429         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3430         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3431         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test / blending need the Finish pass, so render into the temporary buffer instead
3432         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3433                 pixel = buffer_FragColorbgra8;
             // Color_Ambient * 256 with components 0 and 2 swapped; 4th component replaced by Alpha * 255
3434         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3435         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3436         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3437         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // Color_Diffuse * 4096 with components 0 and 2 swapped; 4th component cleared
3438         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3439         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3440         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3441         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // same component swap for the interpolated color attribute
3442         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3443         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // start at the span's first pixel and pre-scale by 4096, so that
             // mulhi(Color_Diffuse*4096, color*4096) lands on the same *256 scale as Color_Ambientm
3444         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3445         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3446         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is advanced by one pixel per iteration here, including for skipped pixels
3447         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3448         {
3449                 __m128i color, mod, pix;
                     // fast path: four consecutive pixels whose mask bytes are all exactly 1
3450                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3451                 {
3452                         __m128i pix2, mod2;
3453                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3454                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // evaluate the color at x, x+1, x+2, x+3; data is stepped three times
                             // here and a fourth time by the loop header after `continue`
3455                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3456                         data = _mm_add_ps(data, slope);
3457                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3458                         data = _mm_add_ps(data, slope);
3459                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3460                         data = _mm_add_ps(data, slope);
3461                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // (diffuse * vertexcolor + ambient) * texel, texel channels in the high byte
3462                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3463                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3464                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3465                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3466                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // 3 here plus the loop's x++ advances by 4 pixels
3467                         x += 3;
3468                         continue;
3469                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3470                 if (!pixelmask[x])
3471                         continue;
3472                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3473                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3474                 mod = _mm_packs_epi32(mod, mod);
3475                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3476                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3477         }
             // only the temporary-buffer case goes through the Finish pass
3478         if (pixel == buffer_FragColorbgra8)
3479                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3480 #endif
3481 }
3482
3483
3484
3485 void DPSOFTRAST_VertexShader_Lightmap(void)
3486 {
3487         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3488         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3489         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3490 }
3491
3492 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3493 {
3494 #ifdef SSE_POSSIBLE
3495         unsigned char * RESTRICT pixelmask = span->pixelmask;
3496         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3497         int x, startx = span->startx, endx = span->endx;
3498         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3499         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3500         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3501         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3502         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3503         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3504         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3505         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3506         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3507         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3508                 pixel = buffer_FragColorbgra8;
3509         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3510         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3511         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3512         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3513         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3514         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3515         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3516         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3517         {
3518                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3519                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3520                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3521                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3522                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3523                 for (x = startx;x < endx;x++)
3524                 {
3525                         __m128i color, lightmap, glow, pix;
3526                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3527                         {
3528                                 __m128i pix2;
3529                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3530                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3531                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3532                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3533                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3534                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3535                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3536                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3537                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3538                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3539                                 x += 3;
3540                                 continue;
3541                         }
3542                         if (!pixelmask[x])
3543                                 continue;
3544                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3545                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3546                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3547                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3548                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3549                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3550                 }
3551         }
3552         else
3553         {
3554                 for (x = startx;x < endx;x++)
3555                 {
3556                         __m128i color, lightmap, pix;
3557                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3558                         {
3559                                 __m128i pix2;
3560                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3561                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3562                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3563                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3564                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3565                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3566                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3567                                 x += 3;
3568                                 continue;
3569                         }
3570                         if (!pixelmask[x]) 
3571                                 continue;
3572                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3573                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3574                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3575                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3576                 }
3577         }
3578         if (pixel == buffer_FragColorbgra8)
3579                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3580 #endif
3581 }
3582
3583
3584 void DPSOFTRAST_VertexShader_LightDirection(void);
3585 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3586
// FakeLight vertex stage: no work of its own, it simply runs the
// LightDirection vertex stage.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3591
3592 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3593 {
3594         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3595 }
3596
3597
3598
3599 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3600 {
3601         DPSOFTRAST_VertexShader_LightDirection();
3602         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3603 }
3604
3605 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3606 {
3607         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3608 }
3609
3610
3611
3612 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3613 {
3614         DPSOFTRAST_VertexShader_LightDirection();
3615         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3616 }
3617
3618 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3619 {
3620         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3621 }
3622
3623
3624
3625 void DPSOFTRAST_VertexShader_LightDirection(void)
3626 {
3627         int i;
3628         int numvertices = dpsoftrast.numvertices;
3629         float LightDir[4];
3630         float LightVector[4];
3631         float EyePosition[4];
3632         float EyeVectorModelSpace[4];
3633         float EyeVector[4];
3634         float position[4];
3635         float svector[4];
3636         float tvector[4];
3637         float normal[4];
3638         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3639         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3640         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3641         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3642         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3643         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3644         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3645         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3646         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3647         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3648         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3649         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3650         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3651         for (i = 0;i < numvertices;i++)
3652         {
3653                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3654                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3655                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3656                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3657                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3658                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3659                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3660                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3661                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3662                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3663                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3664                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3665                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3666                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3667                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3668                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3669                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3670                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3671                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3672                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3673                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3674                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3675                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3676                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3677                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3678                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3679                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3680                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3681                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3682         }
3683         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3684 }
3685
// Small scalar/3-component vector helpers (Vector3Dot and Vector3Normalize
// are used by the LightDirection pixel shader below).
//
// All of these are macros, so arguments may be evaluated more than once:
// do not pass expressions with side effects (x++, function calls, ...).
//
// DPSOFTRAST_Min / DPSOFTRAST_Max      smaller / larger of two values
// DPSOFTRAST_Vector3Dot(a,b)           a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
// DPSOFTRAST_Vector3LengthSquared(v)   dot(v, v)
// DPSOFTRAST_Vector3Length(v)          sqrt(dot(v, v)), a double from sqrt()
// DPSOFTRAST_Vector3Normalize(v)       scales v[0..2] in place to unit length;
//                                      a zero-length vector is left untouched
//
// Vector3Normalize parenthesizes its argument and uses a prefixed local name
// so that expression arguments (e.g. ptr + offset) and a caller variable
// named "len" both work.
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len = (float)sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len)\
	{\
		dpsoftrast_normalize_len = 1.0f / dpsoftrast_normalize_len;\
		(v)[0] *= dpsoftrast_normalize_len;\
		(v)[1] *= dpsoftrast_normalize_len;\
		(v)[2] *= dpsoftrast_normalize_len;\
	}\
}\
while(0)
3704
3705 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3706 {
3707         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3708         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3709         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3710         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3711         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3712         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3713         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3714         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3715         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3716         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3717         int x, startx = span->startx, endx = span->endx;
3718         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3719         float LightVectordata[4];
3720         float LightVectorslope[4];
3721         float EyeVectordata[4];
3722         float EyeVectorslope[4];
3723         float VectorSdata[4];
3724         float VectorSslope[4];
3725         float VectorTdata[4];
3726         float VectorTslope[4];
3727         float VectorRdata[4];
3728         float VectorRslope[4];
3729         float z;
3730         float diffusetex[4];
3731         float glosstex[4];
3732         float surfacenormal[4];
3733         float lightnormal[4];
3734         float lightnormal_modelspace[4];
3735         float eyenormal[4];
3736         float specularnormal[4];
3737         float diffuse;
3738         float specular;
3739         float SpecularPower;
3740         int d[4];
3741         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3742         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3743         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3744         Color_Glow[3] = 0.0f;
3745         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3746         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3747         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3748         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3749         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3750         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3751         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3752         Color_Pants[3] = 0.0f;
3753         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3754         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3755         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3756         Color_Shirt[3] = 0.0f;
3757         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3758         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3759         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3760         {
3761                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3762                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3763         }
3764         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3765         {
3766                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3767         }
3768         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3769         {
3770                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3771                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3772                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3773                 Color_Diffuse[3] = 0.0f;
3774                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3775                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3776                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3777                 LightColor[3] = 0.0f;
3778                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3779                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3780                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3781                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3782                 Color_Specular[3] = 0.0f;
3783                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3784                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3785                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3786
3787                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3788                 {
3789                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3790                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3791                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3792                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3793                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3794                 }
3795                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3796                 {
3797                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3798                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3799                 }
3800                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3801                 {
3802                         // nothing of this needed
3803                 }
3804                 else
3805                 {
3806                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3807                 }
3808
3809                 for (x = startx;x < endx;x++)
3810                 {
3811                         z = buffer_z[x];
3812                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3813                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3814                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3815                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3816                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3817                         {
3818                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3819                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3820                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3821                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3822                         }
3823                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3824                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3825                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3826                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3827                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3828                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3829                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3830                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3831
3832                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3833                         {
3834                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3835                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3836                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3837                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3838
3839                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3840                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3841                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3842                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3843
3844                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3845                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3846                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3847                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3848
3849                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3850                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3851                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3852                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3853
3854                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3855                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3856
3857                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3858                                 {
3859                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3860                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3861                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3862                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3863                                 }
3864                         }
3865                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3866                         {
3867                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3868                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3869                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3870                                 {
3871                                         float f = 1.0f / 256.0f;
3872                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3873                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3874                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3875                                 }
3876                         }
3877                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3878                         {
3879                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3880                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3881                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3882                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3883
3884                                 LightColor[0] = 1.0;
3885                                 LightColor[1] = 1.0;
3886                                 LightColor[2] = 1.0;
3887                         }
3888                         else
3889                         {
3890                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3891                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3892                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3893                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3894                         }
3895
3896                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3897
3898                         if(thread->shader_exactspecularmath)
3899                         {
3900                                 // reflect lightnormal at surfacenormal, take the negative of that
3901                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3902                                 float f;
3903                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3904                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3905                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3906                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3907
3908                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3909                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3910                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3911                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3912                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3913
3914                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3915                         }
3916                         else
3917                         {
3918                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3919                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3920                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3921                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3922
3923                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3924                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3925                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3926                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3927
3928                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3929                         }
3930
3931                         specular = pow(specular, SpecularPower * glosstex[3]);
3932                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3933                         {
3934                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3935                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3936                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3937                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3938                         }
3939                         else
3940                         {
3941                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3942                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3943                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3944                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3945                         }
3946
3947                         buffer_FragColorbgra8[x*4+0] = d[0];
3948                         buffer_FragColorbgra8[x*4+1] = d[1];
3949                         buffer_FragColorbgra8[x*4+2] = d[2];
3950                         buffer_FragColorbgra8[x*4+3] = d[3];
3951                 }
3952         }
3953         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3954         {
3955                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3956                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3957                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3958                 Color_Diffuse[3] = 0.0f;
3959                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3960                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3961                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3962                 LightColor[3] = 0.0f;
3963                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3964
3965                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3966                 {
3967                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3968                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3969                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3970                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3971                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3972                 }
3973                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3974                 {
3975                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3976                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3977                 }
3978                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3979                 {
3980                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3981                 }
3982                 else
3983                 {
3984                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3985                 }
3986
3987                 for (x = startx;x < endx;x++)
3988                 {
3989                         z = buffer_z[x];
3990                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3991                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3992                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3993                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3994                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3995                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3996                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3997                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3998
3999                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4000                         {
4001                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4002                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4003                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4004                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4005
4006                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4007                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4008                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4009                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4010
4011                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4012                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4013                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4014                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4015
4016                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4017                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4018                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4019                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4020
4021                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4022                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4023
4024                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4025                                 {
4026                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4027                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4028                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4029                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4030                                 }
4031                         }
4032                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4033                         {
4034                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4035                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4036                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4037                                 {
4038                                         float f = 1.0f / 256.0f;
4039                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4040                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4041                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4042                                 }
4043                         }
4044                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4045                         {
4046                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4047                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4048                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4049                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4050
4051                                 LightColor[0] = 1.0;
4052                                 LightColor[1] = 1.0;
4053                                 LightColor[2] = 1.0;
4054                         }
4055                         else
4056                         {
4057                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4058                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4059                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4060                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4061                         }
4062
4063                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4064                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4065                         {
4066                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4067                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4068                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4069                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4070                         }
4071                         else
4072                         {
4073                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4074                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4075                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4076                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4077                         }
4078                         buffer_FragColorbgra8[x*4+0] = d[0];
4079                         buffer_FragColorbgra8[x*4+1] = d[1];
4080                         buffer_FragColorbgra8[x*4+2] = d[2];
4081                         buffer_FragColorbgra8[x*4+3] = d[3];
4082                 }
4083         }
4084         else
4085         {
4086                 for (x = startx;x < endx;x++)
4087                 {
4088                         z = buffer_z[x];
4089                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4090                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4091                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4092                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4093
4094                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4095                         {
4096                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4097                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4098                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4099                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4100                         }
4101                         else
4102                         {
4103                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4104                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4105                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4106                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4107                         }
4108                         buffer_FragColorbgra8[x*4+0] = d[0];
4109                         buffer_FragColorbgra8[x*4+1] = d[1];
4110                         buffer_FragColorbgra8[x*4+2] = d[2];
4111                         buffer_FragColorbgra8[x*4+3] = d[3];
4112                 }
4113         }
4114         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4115 }
4116
4117
4118
4119 void DPSOFTRAST_VertexShader_LightSource(void)
4120 {
4121         int i;
4122         int numvertices = dpsoftrast.numvertices;
4123         float LightPosition[4];
4124         float LightVector[4];
4125         float LightVectorModelSpace[4];
4126         float EyePosition[4];
4127         float EyeVectorModelSpace[4];
4128         float EyeVector[4];
4129         float position[4];
4130         float svector[4];
4131         float tvector[4];
4132         float normal[4];
4133         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4134         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4135         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4136         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4137         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4138         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4139         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4140         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4141         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4142         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4143         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4144         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4145         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4146         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4147         for (i = 0;i < numvertices;i++)
4148         {
4149                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4150                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4151                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4152                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4153                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4154                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4155                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4156                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4157                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4158                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4159                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4160                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4161                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4162                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4163                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4164                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4165                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4166                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4167                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4168                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4169                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4170                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4171                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4172                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4173                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4174                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4175                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4176                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4177                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4178                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4179                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4180                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4181         }
4182         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4183         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4184 }
4185
4186 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4187 {
4188 #ifdef SSE_POSSIBLE
4189         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4190         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4191         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4192         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4193         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4194         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4195         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4196         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4197         int x, startx = span->startx, endx = span->endx;
4198         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4199         float CubeVectordata[4];
4200         float CubeVectorslope[4];
4201         float LightVectordata[4];
4202         float LightVectorslope[4];
4203         float EyeVectordata[4];
4204         float EyeVectorslope[4];
4205         float z;
4206         float diffusetex[4];
4207         float glosstex[4];
4208         float surfacenormal[4];
4209         float lightnormal[4];
4210         float eyenormal[4];
4211         float specularnormal[4];
4212         float diffuse;
4213         float specular;
4214         float SpecularPower;
4215         float CubeVector[4];
4216         float attenuation;
4217         int d[4];
4218         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4219         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4220         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4221         Color_Glow[3] = 0.0f;
4222         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4223         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4224         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4225         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4226         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4227         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4228         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4229         Color_Diffuse[3] = 0.0f;
4230         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4231         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4232         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4233         Color_Specular[3] = 0.0f;
4234         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4235         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4236         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4237         Color_Pants[3] = 0.0f;
4238         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4239         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4240         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4241         Color_Shirt[3] = 0.0f;
4242         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4243         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4244         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4245         LightColor[3] = 0.0f;
4246         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4247         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4248         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4249         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4250         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4251         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4252         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4253         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4254         {
4255                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4256                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4257         }
4258         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4259                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4260         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4261         {
4262                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4263                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4264                 for (x = startx;x < endx;x++)
4265                 {
4266                         z = buffer_z[x];
4267                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4268                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4269                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4270                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4271                         if (attenuation < 0.01f)
4272                                 continue;
4273                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4274                         {
4275                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4276                                 if (attenuation < 0.01f)
4277                                         continue;
4278                         }
4279
4280                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4281                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4282                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4283                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4284                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4285                         {
4286                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4287                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4288                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4289                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4290                         }
4291                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4292                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4293                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4294                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4295                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4296                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4297                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4298                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4299
4300                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4301                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4302                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4303                         DPSOFTRAST_Vector3Normalize(lightnormal);
4304
4305                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4306
4307                         if(thread->shader_exactspecularmath)
4308                         {
4309                                 // reflect lightnormal at surfacenormal, take the negative of that
4310                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4311                                 float f;
4312                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4313                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4314                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4315                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4316
4317                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4318                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4319                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4320                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4321                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4322
4323                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4324                         }
4325                         else
4326                         {
4327                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4328                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4329                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4330                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4331
4332                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4333                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4334                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4335                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4336
4337                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4338                         }
4339                         specular = pow(specular, SpecularPower * glosstex[3]);
4340
4341                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4342                         {
4343                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4344                                 attenuation *= (1.0f / 255.0f);
4345                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4346                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4347                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4348                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4349                         }
4350                         else
4351                         {
4352                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4353                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4354                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4355                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4356                         }
4357                         buffer_FragColorbgra8[x*4+0] = d[0];
4358                         buffer_FragColorbgra8[x*4+1] = d[1];
4359                         buffer_FragColorbgra8[x*4+2] = d[2];
4360                         buffer_FragColorbgra8[x*4+3] = d[3];
4361                 }
4362         }
4363         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4364         {
4365                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4366                 for (x = startx;x < endx;x++)
4367                 {
4368                         z = buffer_z[x];
4369                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4370                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4371                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4372                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4373                         if (attenuation < 0.01f)
4374                                 continue;
4375                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4376                         {
4377                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4378                                 if (attenuation < 0.01f)
4379                                         continue;
4380                         }
4381
4382                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4383                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4384                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4385                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4386                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4387                         {
4388                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4389                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4390                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4391                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4392                         }
4393                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4394                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4395                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4396                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4397
4398                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4399                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4400                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4401                         DPSOFTRAST_Vector3Normalize(lightnormal);
4402
4403                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4404                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4405                         {
4406                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4407                                 attenuation *= (1.0f / 255.0f);
4408                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4409                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4410                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4411                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4412                         }
4413                         else
4414                         {
4415                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4416                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4417                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4418                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4419                         }
4420                         buffer_FragColorbgra8[x*4+0] = d[0];
4421                         buffer_FragColorbgra8[x*4+1] = d[1];
4422                         buffer_FragColorbgra8[x*4+2] = d[2];
4423                         buffer_FragColorbgra8[x*4+3] = d[3];
4424                 }
4425         }
4426         else
4427         {
4428                 for (x = startx;x < endx;x++)
4429                 {
4430                         z = buffer_z[x];
4431                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4432                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4433                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4434                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4435                         if (attenuation < 0.01f)
4436                                 continue;
4437                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4438                         {
4439                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4440                                 if (attenuation < 0.01f)
4441                                         continue;
4442                         }
4443
4444                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4445                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4446                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4447                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4448                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4449                         {
4450                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4451                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4452                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4453                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4454                         }
4455                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4456                         {
4457                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4458                                 attenuation *= (1.0f / 255.0f);
4459                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4460                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4461                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4462                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4463                         }
4464                         else
4465                         {
4466                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4467                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4468                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4469                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4470                         }
4471                         buffer_FragColorbgra8[x*4+0] = d[0];
4472                         buffer_FragColorbgra8[x*4+1] = d[1];
4473                         buffer_FragColorbgra8[x*4+2] = d[2];
4474                         buffer_FragColorbgra8[x*4+3] = d[3];
4475                 }
4476         }
4477         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4478 #endif
4479 }
4480
4481
4482
4483 void DPSOFTRAST_VertexShader_Refraction(void)
4484 {
4485         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4486         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4487         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4488 }
4489
4490 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4491 {
4492         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4493
4494         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4495         float z;
4496         int x, startx = span->startx, endx = span->endx;
4497
4498         // texture reads
4499         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4500         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4501
4502         // varyings
4503         float ModelViewProjectionPositiondata[4];
4504         float ModelViewProjectionPositionslope[4];
4505
4506         // uniforms
4507         float ScreenScaleRefractReflect[2];
4508         float ScreenCenterRefractReflect[2];
4509         float DistortScaleRefractReflect[2];
4510         float RefractColor[4];
4511
4512         const unsigned char * RESTRICT pixelbase;
4513         const unsigned char * RESTRICT pixel[4];
4514         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4515         if(!texture) return;
4516         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4517
4518         // read textures
4519         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4520         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4521
4522         // read varyings
4523         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4524
4525         // read uniforms
4526         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4527         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4528         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4529         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4530         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4531         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4532         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4533         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4534         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4535         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4536
4537         // do stuff
4538         for (x = startx;x < endx;x++)
4539         {
4540                 float SafeScreenTexCoord[2];
4541                 float ScreenTexCoord[2];
4542                 float v[3];
4543                 float iw;
4544                 unsigned char c[4];
4545
4546                 z = buffer_z[x];
4547
4548                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4549                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4550                 
4551                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4552                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4553                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4554
4555                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4556                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4557                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4558                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4559                 DPSOFTRAST_Vector3Normalize(v);
4560                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4561                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4562
4563                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4564                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4565                 {
4566                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4567                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4568                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4569                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4570                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4571                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4572                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4573                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4574                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4575                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4576                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4577                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4578                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4579                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4580                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4581                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4582                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4583                 }
4584                 else
4585                 {
4586                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4587                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4588                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4589                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4590                         c[0] = pixel[0][0];
4591                         c[1] = pixel[0][1];
4592                         c[2] = pixel[0][2];
4593                 }
4594
4595                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4596                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4597                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4598                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4599                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4600         }
4601
4602         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4603 }
4604
4605
4606
4607 void DPSOFTRAST_VertexShader_Water(void)
4608 {
4609         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4610 }
4611
4612
4613 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4614 {
4615         // TODO: IMPLEMENT
4616         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4617         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4618         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4619         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4620         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4621 }
4622
4623
4624
4625 void DPSOFTRAST_VertexShader_ShowDepth(void)
4626 {
4627         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4628 }
4629
4630 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4631 {
4632         // TODO: IMPLEMENT
4633         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4634         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4635         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4636         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4637         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4638 }
4639
4640
4641
4642 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4643 {
4644         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4645 }
4646
4647 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4648 {
4649         // TODO: IMPLEMENT
4650         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4651         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4652         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4653         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4654         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4655 }
4656
4657
4658
4659 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4660 {
4661         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4662 }
4663
4664 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4665 {
4666         // TODO: IMPLEMENT
4667         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4668         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4669         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4670         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4671         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4672 }
4673
4674
4675
4676 typedef struct DPSOFTRAST_ShaderModeInfo_s
4677 {
4678         int lodarrayindex;
4679         void (*Vertex)(void);
4680         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4681         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4682         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4683 }
4684 DPSOFTRAST_ShaderModeInfo;
4685
4686 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4687 {
4688         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4689         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4690         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4691         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4692         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4693         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4694         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4695         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4696         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4697         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4698         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4699         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4700         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4701         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4702         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4703         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4704 };
4705
4706 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4707 {
4708         int i;
4709         int x;
4710         int startx;
4711         int endx;
4712 //      unsigned int c;
4713 //      unsigned int *colorpixel;
4714         unsigned int *depthpixel;
4715         float w;
4716         float wslope;
4717         int depth;
4718         int depthslope;
4719         unsigned int d;
4720         DPSOFTRAST_State_Triangle *triangle;
4721         DPSOFTRAST_State_Span *span;
4722         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4723         for (i = 0; i < thread->numspans; i++)
4724         {
4725                 span = &thread->spans[i];
4726                 triangle = &thread->triangles[span->triangle];
4727                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4728                 {
4729                         wslope = triangle->w[0];
4730                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4731                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4732                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4733                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4734                         startx = span->startx;
4735                         endx = span->endx;
4736                         switch(thread->fb_depthfunc)
4737                         {
4738                         default:
4739                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4740                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4741                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4742                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4743                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4744                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4745                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4746                         }
4747                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4748                         //for (x = startx;x < endx;x++)
4749                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4750                         // if there is no color buffer, skip pixel shader
4751                         while (startx < endx && !pixelmask[startx])
4752                                 startx++;
4753                         while (endx > startx && !pixelmask[endx-1])
4754                                 endx--;
4755                         if (startx >= endx)
4756                                 continue; // no pixels to fill
4757                         span->pixelmask = pixelmask;
4758                         span->startx = startx;
4759                         span->endx = endx;
4760                         // run pixel shader if appropriate
4761                         // do this before running depthmask code, to allow the pixelshader
4762                         // to clear pixelmask values for alpha testing
4763                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4764                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4765                         if (thread->depthmask)
4766                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4767                                         if (pixelmask[x])
4768                                                 depthpixel[x] = d;
4769                 }
4770                 else
4771                 {
4772                         // no depth testing means we're just dealing with color...
4773                         // if there is no color buffer, skip pixel shader
4774                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4775                         {
4776                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4777                                 span->pixelmask = pixelmask;
4778                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4779                         }
4780                 }
4781         }
4782         thread->numspans = 0;
4783 }
4784
// Draw command record (opcode argument 22), consumed by DPSOFTRAST_Interpret_Draw
// below as a DPSOFTRAST_Command_Draw.
// NOTE(review): DEFCOMMAND is defined outside this part of the file; presumably
// it declares the DPSOFTRAST_Command_Draw struct from the field list - confirm.
// Field usage as seen in DPSOFTRAST_Interpret_Draw:
//   starty, endy  - compared against the thread's clamped row bands; the command
//                   is skipped by a thread when it overlaps neither band
//   refcount      - atomically decremented when a thread skips the command; on
//                   reaching zero, arrays is released with MM_FREE if commandsize
//                   is no larger than the aligned size of the command struct
//   clipped       - when zero, the near-plane clip classification is bypassed
//   firstvertex   - subtracted from every element index
//   numvertices   - each array occupies numvertices*4 floats
//   numtriangles  - number of triangles to process
//   arrays        - screen coordinates first (4 floats per vertex), followed by
//                   further per-vertex arrays
//   element3i/3s  - optional index lists (element3s is checked first); when both
//                   are NULL, vertices are consumed sequentially, 3 per triangle
//   datasize      - not referenced in the visible part of the interpreter
4785 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4786
4787 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4788 {
4789 #ifdef SSE_POSSIBLE
4790         int cullface = thread->cullface;
4791         int minx, maxx, miny, maxy;
4792         int miny1, maxy1, miny2, maxy2;
4793         __m128i fbmin, fbmax;
4794         __m128 viewportcenter, viewportscale;
4795         int firstvertex = command->firstvertex;
4796         int numvertices = command->numvertices;
4797         int numtriangles = command->numtriangles;
4798         const int *element3i = command->element3i;
4799         const unsigned short *element3s = command->element3s;
4800         int clipped = command->clipped;
4801         int i;
4802         int j;
4803         int k;
4804         int y;
4805         int e[3];
4806         __m128i screeny;
4807         int starty, endy, bandy;
4808         int numpoints;
4809         int clipcase;
4810         float clipdist[4];
4811         float clip0origin, clip0slope;
4812         int clip0dir;
4813         __m128 triangleedge1, triangleedge2, trianglenormal;
4814         __m128 clipfrac[3];
4815         __m128 screen[4];
4816         DPSOFTRAST_State_Triangle *triangle;
4817         DPSOFTRAST_Texture *texture;
4818         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4819         miny = thread->fb_scissor[1];
4820         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4821         miny1 = bound(miny, thread->miny1, maxy);
4822         maxy1 = bound(miny, thread->maxy1, maxy);
4823         miny2 = bound(miny, thread->miny2, maxy);
4824         maxy2 = bound(miny, thread->maxy2, maxy);
4825         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4826         {
4827                 if (!ATOMIC_DECREMENT(command->refcount))
4828                 {
4829                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4830                                 MM_FREE(command->arrays);
4831                 }
4832                 return;
4833         }
4834         minx = thread->fb_scissor[0];
4835         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4836         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4837         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4838         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4839         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4840         screen[3] = _mm_setzero_ps();
4841         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4842         for (i = 0;i < numtriangles;i++)
4843         {
4844                 const float *screencoord4f = command->arrays;
4845                 const float *arrays = screencoord4f + numvertices*4;
4846
4847                 // generate the 3 edges of this triangle
4848                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4849                 if (element3s)
4850                 {
4851                         e[0] = element3s[i*3+0] - firstvertex;
4852                         e[1] = element3s[i*3+1] - firstvertex;
4853                         e[2] = element3s[i*3+2] - firstvertex;
4854                 }
4855                 else if (element3i)
4856                 {
4857                         e[0] = element3i[i*3+0] - firstvertex;
4858                         e[1] = element3i[i*3+1] - firstvertex;
4859                         e[2] = element3i[i*3+2] - firstvertex;
4860                 }
4861                 else
4862                 {
4863                         e[0] = i*3+0;
4864                         e[1] = i*3+1;
4865                         e[2] = i*3+2;
4866                 }
4867
4868 #define SKIPBACKFACE \
4869                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4870                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4871                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4872                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4873                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4874                 switch(cullface) \
4875                 { \
4876                 case GL_BACK: \
4877                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4878                                 continue; \
4879                         break; \
4880                 case GL_FRONT: \
4881                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4882                                 continue; \
4883                         break; \
4884                 }
4885
4886 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4887                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4888                         { \
4889                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4890                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4891                         }
4892 #define CLIPPEDVERTEXCOPY(k,p1) \
4893                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4894
4895 #define GENATTRIBCOPY(attrib, p1) \
4896                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4897 #define GENATTRIBLERP(attrib, p1, p2) \
4898                 { \
4899                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4900                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4901                 }
4902 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4903                 switch(clipcase) \
4904                 { \
4905                 default: \
4906                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4907                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4908                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4909                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4910                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4911                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4912                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4913                 }
4914
4915                 if (! clipped)
4916                         goto notclipped;
4917
4918                 // calculate distance from nearplane
4919                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4920                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4921                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4922                 if (clipdist[0] >= 0.0f)
4923                 {
4924                         if (clipdist[1] >= 0.0f)
4925                         {
4926                                 if (clipdist[2] >= 0.0f)
4927                                 {
4928                                 notclipped:
4929                                         // triangle is entirely in front of nearplane
4930                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4931                                         SKIPBACKFACE;
4932                                         numpoints = 3;
4933                                         clipcase = 0;
4934                                 }
4935                                 else
4936                                 {
4937                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4938                                         SKIPBACKFACE;
4939                                         numpoints = 4;
4940                                         clipcase = 1;
4941                                 }
4942                         }
4943                         else
4944                         {
4945                                 if (clipdist[2] >= 0.0f)
4946                                 {
4947                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4948                                         SKIPBACKFACE;
4949                                         numpoints = 4;
4950                                         clipcase = 2;
4951                                 }
4952                                 else
4953                                 {
4954                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4955                                         SKIPBACKFACE;
4956                                         numpoints = 3;
4957                                         clipcase = 3;
4958                                 }
4959                         }
4960                 }
4961                 else if (clipdist[1] >= 0.0f)
4962                 {
4963                         if (clipdist[2] >= 0.0f)
4964                         {
4965                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4966                                 SKIPBACKFACE;
4967                                 numpoints = 4;
4968                                 clipcase = 4;
4969                         }
4970                         else
4971                         {
4972                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4973                                 SKIPBACKFACE;
4974                                 numpoints = 3;
4975                                 clipcase = 5;
4976                         }
4977                 }
4978                 else if (clipdist[2] >= 0.0f)
4979                 {
4980                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4981                         SKIPBACKFACE;
4982                         numpoints = 3;
4983                         clipcase = 6;
4984                 }
4985                 else continue; // triangle is entirely behind nearplane
4986
4987                 {
4988                         // calculate integer y coords for triangle points
4989                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4990                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4991                                         screenmin = _mm_min_epi16(screeni, screenir),
4992                                         screenmax = _mm_max_epi16(screeni, screenir);
4993                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4994                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4995                         screenmin = _mm_max_epi16(screenmin, fbmin);
4996                         screenmax = _mm_min_epi16(screenmax, fbmax);
4997                         // skip offscreen triangles
4998                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4999                                 continue;
5000                         starty = _mm_extract_epi16(screenmin, 1);
5001                         endy = _mm_extract_epi16(screenmax, 1)+1;
5002                         if (starty >= maxy1 && endy <= miny2)
5003                                 continue;
5004                         screeny = _mm_srai_epi32(screeni, 16);
5005                 }
5006
5007                 triangle = &thread->triangles[thread->numtriangles];
5008
5009                 // calculate attribute plans for triangle data...
5010                 // okay, this triangle is going to produce spans, we'd better project
5011                 // the interpolants now (this is what gives perspective texturing),
5012                 // this consists of simply multiplying all arrays by the W coord
5013                 // (which is basically 1/Z), which will be undone per-pixel
5014                 // (multiplying by Z again) to get the perspective-correct array
5015                 // values
5016                 {
5017                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5018                         __m128 mipedgescale, mipdensity;
5019                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5020                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5021                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5022                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5023                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5024                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5025                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5026                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5027                         attribedge1 = _mm_sub_ss(w0, w1);
5028                         attribedge2 = _mm_sub_ss(w2, w1);
5029                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5030                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5031                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5032                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5033                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5034                         _mm_store_ss(&triangle->w[0], attribxslope);
5035                         _mm_store_ss(&triangle->w[1], attribyslope);
5036                         _mm_store_ss(&triangle->w[2], attriborigin);
5037                         
5038                         clip0origin = 0;
5039                         clip0slope = 0;
5040                         clip0dir = 0;
5041                         if(thread->clipplane[0] || thread->clipplane[1] || thread->clipplane[2])
5042                         {
5043                                 float cliporigin, clipxslope, clipyslope;
5044                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5045                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5046                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5047                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5048                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5049                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5050                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->clipplane[2] + thread->clipplane[3];
5051                                 clipxslope = thread->clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->clipplane[2];
5052                                 clipyslope = thread->clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->clipplane[2];
5053                                 if(clipxslope != 0)
5054                                 {
5055                                         clip0origin = -cliporigin/clipxslope;
5056                                         clip0slope = -clipyslope/clipxslope;
5057                                         clip0dir = clipxslope > 0 ? 1 : -1;
5058                                 }
5059                                 else if(clipyslope > 0)
5060                                 {
5061                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5062                                         clip0slope = dpsoftrast.fb_width;
5063                                         clip0dir = -1;
5064                                 }
5065                                 else if(clipyslope < 0)
5066                                 {
5067                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5068                                         clip0slope = -dpsoftrast.fb_width;
5069                                         clip0dir = -1;
5070                                 }
5071                                 else if(clip0origin < 0) continue;
5072                         }
5073
5074                         mipedgescale = _mm_setzero_ps();
5075                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5076                         {
5077                                 __m128 attrib0, attrib1, attrib2;
5078                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5079                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5080                                         break;
5081                                 arrays += numvertices*4;
5082                                 GENATTRIBS(attrib0, attrib1, attrib2);
5083                                 attriborigin = _mm_mul_ps(attrib1, w1);
5084                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5085                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5086                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5087                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5088                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5089                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5090                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5091                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5092                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5093                                 {
5094                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5095                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5096                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5097                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5098                                 }
5099                         }
5100
5101                         memset(triangle->mip, 0, sizeof(triangle->mip));
5102                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5103                         {
5104                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5105                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5106                                         break;
5107                                 texture = thread->texbound[texunit];
5108                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5109                                 {
5110                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5111                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5112                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5113                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5114                                         // this will be multiplied in the texturing routine by the texture resolution
5115                                         y = _mm_cvtss_si32(mipdensity);
5116                                         if (y > 0)
5117                                         {
5118                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5119                                                 if (y > texture->mipmaps - 1)
5120                                                         y = texture->mipmaps - 1;
5121                                                 triangle->mip[texunit] = y;
5122                                         }
5123                                 }
5124                         }
5125                 }
5126         
5127                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5128                 for (; y < bandy;)
5129                 {
5130                         __m128 xcoords, xslope;
5131                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5132                         int yccmask = _mm_movemask_epi8(ycc);
5133                         int edge0p, edge0n, edge1p, edge1n;
5134                         int nexty;
5135                         float clip0;
5136                         if (numpoints == 4)
5137                         {
5138                                 switch(yccmask)
5139                                 {
5140                                 default:
5141                                 case 0xFFFF: /*0000*/ y = endy; continue;
5142                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5143                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5144                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5145                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5146                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5147                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5148                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5149                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5150                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5151                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5152                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5153                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5154                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5155                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5156                                 case 0x0000: /*1111*/ y++; continue;
5157                                 }
5158                         }
5159                         else
5160                         {
5161                                 switch(yccmask)
5162                                 {
5163                                 default:
5164                                 case 0xFFFF: /*000*/ y = endy; continue;
5165                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5166                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5167                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5168                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5169                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5170                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5171                                 case 0x0000: /*111*/ y++; continue;
5172                                 }
5173                         }
5174                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5175                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5176                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5177                         nexty = _mm_extract_epi16(ycc, 0);
5178                         if (nexty >= bandy) nexty = bandy-1;
5179                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5180                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5181                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5182                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5183                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5184                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5185                         {
5186                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5187                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5188                         }
5189                         clip0 = clip0origin + (y+0.5f)*clip0slope;
5190                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5191                         {
5192                                 int startx, endx, clipx = minx, offset;
5193                                 startx = _mm_cvtss_si32(xcoords);
5194                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5195                                 if (startx < minx) 
5196                                 {
5197                                         if (startx < 0) startx = 0;
5198                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5199                                 }
5200                                 if (endx > maxx) endx = maxx;
5201                                 if (startx >= endx) continue;
5202
5203                                 if (clip0dir)
5204                                 {
5205                                         if (clip0dir > 0)
5206                                         {
5207                                                 if (startx < clip0) 
5208                                                 {
5209                                                         if(endx <= clip0) continue;
5210                                                         clipx = max((int)clip0, minx);
5211                                                         startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1); 
5212                                                 }
5213                                         }
5214                                         else if (endx > clip0) 
5215                                         {
5216                                                 if(startx >= clip0) continue;
5217                                                 endx = (int)clip0;
5218                                         }
5219                                 }
5220                                                 
5221                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5222                                 {
5223                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5224                                         span->triangle = thread->numtriangles;
5225                                         span->x = offset;
5226                                         span->y = y;
5227                                         span->startx = max(clipx - offset, 0);
5228                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5229                                         if (span->startx >= span->endx)
5230                                                 continue; 
5231                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5232                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5233                                 }
5234                         }
5235                 }
5236
5237                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5238                 {
5239                         DPSOFTRAST_Draw_ProcessSpans(thread);
5240                         thread->numtriangles = 0;
5241                 }
5242         }
5243
5244         if (!ATOMIC_DECREMENT(command->refcount))
5245         {
5246                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5247                         MM_FREE(command->arrays);
5248         }
5249
5250         if (thread->numspans > 0 || thread->numtriangles > 0)
5251         {
5252                 DPSOFTRAST_Draw_ProcessSpans(thread);
5253                 thread->numtriangles = 0;
5254         }
5255 #endif
5256 }
5257
5258 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5259 {
5260         int i;
5261         int j;
5262         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5263         int datasize = 2*numvertices*sizeof(float[4]);
5264         DPSOFTRAST_Command_Draw *command;
5265         unsigned char *data;
5266         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5267         {
5268                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5269                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5270                         break;
5271                 datasize += numvertices*sizeof(float[4]);
5272         }
5273         if (element3s)
5274                 datasize += numtriangles*sizeof(unsigned short[3]);
5275         else if (element3i)
5276                 datasize += numtriangles*sizeof(int[3]);
5277         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5278         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5279         {
5280                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5281                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5282         }
5283         else
5284         {
5285                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5286                 data = (unsigned char *)command + commandsize;
5287         }
5288         command->firstvertex = firstvertex;
5289         command->numvertices = numvertices;
5290         command->numtriangles = numtriangles;
5291         command->arrays = (float *)data;
5292         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5293         dpsoftrast.firstvertex = firstvertex;
5294         dpsoftrast.numvertices = numvertices;
5295         dpsoftrast.screencoord4f = (float *)data;
5296         data += numvertices*sizeof(float[4]);
5297         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5298         data += numvertices*sizeof(float[4]);
5299         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5300         {
5301                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5302                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5303                         break;
5304                 dpsoftrast.post_array4f[j] = (float *)data;
5305                 data += numvertices*sizeof(float[4]);
5306         }
5307         command->element3i = NULL;
5308         command->element3s = NULL;
5309         if (element3s)
5310         {
5311                 command->element3s = (unsigned short *)data;
5312                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5313         }
5314         else if (element3i)
5315         {
5316                 command->element3i = (int *)data;
5317                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5318         }
5319         return command;
5320 }
5321
5322 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5323 {
5324         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5325         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5326         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5327         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5328         if (command->starty >= command->endy)
5329         {
5330                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5331                         MM_FREE(command->arrays);
5332                 DPSOFTRAST_UndoCommand(command->commandsize);
5333                 return;
5334         }
5335         command->clipped = dpsoftrast.drawclipped;
5336         command->refcount = dpsoftrast.numthreads;
5337
5338         if (dpsoftrast.usethreads)
5339         {
5340                 int i;
5341                 DPSOFTRAST_Draw_SyncCommands();
5342                 for (i = 0; i < dpsoftrast.numthreads; i++)
5343                 {
5344                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5345                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5346                                 Thread_CondSignal(thread->drawcond);
5347                 }
5348         }
5349         else
5350         {
5351                 DPSOFTRAST_Draw_FlushThreads();
5352         }
5353 }
5354
5355 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5356 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5357 {
5358         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5359 }
5360 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5361 {
5362         DPSOFTRAST_Command_SetRenderTargets *command;
5363         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5364                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5365                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5366                 DPSOFTRAST_Flush();
5367         dpsoftrast.fb_width = width;
5368         dpsoftrast.fb_height = height;
5369         dpsoftrast.fb_depthpixels = depthpixels;
5370         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5371         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5372         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5373         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5374         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5375         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5376         command->width = width;
5377         command->height = height;
5378 }
5379  
5380 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5381 {
5382         int commandoffset = thread->commandoffset;
5383         while (commandoffset != endoffset)
5384         {
5385                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5386                 switch (command->opcode)
5387                 {
5388 #define INTERPCOMMAND(name) \
5389                 case DPSOFTRAST_OPCODE_##name : \
5390                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5391                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5392                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5393                                 commandoffset = 0; \
5394                         break;
5395                 INTERPCOMMAND(Viewport)
5396                 INTERPCOMMAND(ClearColor)
5397                 INTERPCOMMAND(ClearDepth)
5398                 INTERPCOMMAND(ColorMask)
5399                 INTERPCOMMAND(DepthTest)
5400                 INTERPCOMMAND(ScissorTest)
5401                 INTERPCOMMAND(Scissor)
5402                 INTERPCOMMAND(BlendFunc)
5403                 INTERPCOMMAND(BlendSubtract)
5404                 INTERPCOMMAND(DepthMask)
5405                 INTERPCOMMAND(DepthFunc)
5406                 INTERPCOMMAND(DepthRange)
5407                 INTERPCOMMAND(PolygonOffset)
5408                 INTERPCOMMAND(CullFace)
5409                 INTERPCOMMAND(AlphaTest)
5410                 INTERPCOMMAND(AlphaFunc)
5411                 INTERPCOMMAND(SetTexture)
5412                 INTERPCOMMAND(SetShader)
5413                 INTERPCOMMAND(Uniform4f)
5414                 INTERPCOMMAND(UniformMatrix4f)
5415                 INTERPCOMMAND(Uniform1i)
5416                 INTERPCOMMAND(SetRenderTargets)
5417                 INTERPCOMMAND(ClipPlane)
5418
5419                 case DPSOFTRAST_OPCODE_Draw:
5420                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5421                         commandoffset += command->commandsize;
5422                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5423                                 commandoffset = 0;
5424                         thread->commandoffset = commandoffset;
5425                         break;
5426
5427                 case DPSOFTRAST_OPCODE_Reset:
5428                         commandoffset = 0;
5429                         break;
5430                 }
5431         }
5432         thread->commandoffset = commandoffset;
5433 }
5434
5435 static int DPSOFTRAST_Draw_Thread(void *data)
5436 {
5437         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5438         while(thread->index >= 0)
5439         {
5440                 if (thread->commandoffset != dpsoftrast.drawcommand)
5441                 {
5442                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5443                 }
5444                 else 
5445                 {
5446                         Thread_LockMutex(thread->drawmutex);
5447                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5448                         {
5449                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5450                                 thread->starving = true;
5451                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5452                                 thread->starving = false;
5453                         }
5454                         Thread_UnlockMutex(thread->drawmutex);
5455                 }
5456         }   
5457         return 0;
5458 }
5459
5460 static void DPSOFTRAST_Draw_FlushThreads(void)
5461 {
5462         DPSOFTRAST_State_Thread *thread;
5463         int i;
5464         DPSOFTRAST_Draw_SyncCommands();
5465         if (dpsoftrast.usethreads) 
5466         {
5467                 for (i = 0; i < dpsoftrast.numthreads; i++)
5468                 {
5469                         thread = &dpsoftrast.threads[i];
5470                         if (thread->commandoffset != dpsoftrast.drawcommand)
5471                         {
5472                                 Thread_LockMutex(thread->drawmutex);
5473                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5474                                         Thread_CondSignal(thread->drawcond);
5475                                 Thread_UnlockMutex(thread->drawmutex);
5476                         }
5477                 }
5478                 for (i = 0; i < dpsoftrast.numthreads; i++)
5479                 {
5480                         thread = &dpsoftrast.threads[i];
5481                         if (thread->commandoffset != dpsoftrast.drawcommand)
5482                         {
5483                                 Thread_LockMutex(thread->drawmutex);
5484                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5485                                 {
5486                                         thread->waiting = true;
5487                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5488                                         thread->waiting = false;
5489                                 }
5490                                 Thread_UnlockMutex(thread->drawmutex);
5491                         }
5492                 }
5493         }
5494         else
5495         {
5496                 for (i = 0; i < dpsoftrast.numthreads; i++)
5497                 {
5498                         thread = &dpsoftrast.threads[i];
5499                         if (thread->commandoffset != dpsoftrast.drawcommand)
5500                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5501                 }
5502         }
5503         dpsoftrast.commandpool.usedcommands = 0;
5504 }
5505
// Public flush entry point: completes all queued commands by running
// DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5510
// Public finish entry point: currently the same as DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5515
5516 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5517 {
5518         int i;
5519         union
5520         {
5521                 int i;
5522                 unsigned char b[4];
5523         }
5524         u;
5525         u.i = 1;
5526         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5527         dpsoftrast.bigendian = u.b[3];
5528         dpsoftrast.fb_width = width;
5529         dpsoftrast.fb_height = height;
5530         dpsoftrast.fb_depthpixels = depthpixels;
5531         dpsoftrast.fb_colorpixels[0] = colorpixels;
5532         dpsoftrast.fb_colorpixels[1] = NULL;
5533         dpsoftrast.fb_colorpixels[1] = NULL;
5534         dpsoftrast.fb_colorpixels[1] = NULL;
5535         dpsoftrast.viewport[0] = 0;
5536         dpsoftrast.viewport[1] = 0;
5537         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5538         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5539         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5540         dpsoftrast.texture_firstfree = 1;
5541         dpsoftrast.texture_end = 1;
5542         dpsoftrast.texture_max = 0;
5543         dpsoftrast.color[0] = 1;
5544         dpsoftrast.color[1] = 1;
5545         dpsoftrast.color[2] = 1;
5546         dpsoftrast.color[3] = 1;
5547         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5548         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5549         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5550         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5551         for (i = 0; i < dpsoftrast.numthreads; i++)
5552         {
5553                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5554                 thread->index = i;
5555                 thread->cullface = GL_BACK;
5556                 thread->colormask[1] = 1;
5557                 thread->colormask[2] = 1;
5558                 thread->colormask[3] = 1;
5559                 thread->blendfunc[0] = GL_ONE;
5560                 thread->blendfunc[1] = GL_ZERO;
5561                 thread->depthmask = true;
5562                 thread->depthtest = true;
5563                 thread->depthfunc = GL_LEQUAL;
5564                 thread->scissortest = false;
5565                 thread->alphatest = false;
5566                 thread->alphafunc = GL_GREATER;
5567                 thread->alphavalue = 0.5f;
5568                 thread->viewport[0] = 0;
5569                 thread->viewport[1] = 0;
5570                 thread->viewport[2] = dpsoftrast.fb_width;
5571                 thread->viewport[3] = dpsoftrast.fb_height;
5572                 thread->scissor[0] = 0;
5573                 thread->scissor[1] = 0;
5574                 thread->scissor[2] = dpsoftrast.fb_width;
5575                 thread->scissor[3] = dpsoftrast.fb_height;
5576                 thread->depthrange[0] = 0;
5577                 thread->depthrange[1] = 1;
5578                 thread->polygonoffset[0] = 0;
5579                 thread->polygonoffset[1] = 0;
5580                 thread->clipplane[0] = 0;
5581                 thread->clipplane[1] = 0;
5582                 thread->clipplane[2] = 0;
5583                 thread->clipplane[3] = 1;
5584         
5585                 DPSOFTRAST_RecalcThread(thread);
5586         
5587                 thread->numspans = 0;
5588                 thread->numtriangles = 0;
5589                 thread->commandoffset = 0;
5590                 thread->waiting = false;
5591                 thread->starving = false;
5592            
5593                 thread->validate = -1;
5594                 DPSOFTRAST_Validate(thread, -1);
5595  
5596                 if (dpsoftrast.usethreads)
5597                 {
5598                         thread->waitcond = Thread_CreateCond();
5599                         thread->drawcond = Thread_CreateCond();
5600                         thread->drawmutex = Thread_CreateMutex();
5601                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5602                 }
5603         }
5604         return 0;
5605 }
5606
5607 void DPSOFTRAST_Shutdown(void)
5608 {
5609         int i;
5610         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5611         {
5612                 DPSOFTRAST_State_Thread *thread;
5613                 for (i = 0; i < dpsoftrast.numthreads; i++)
5614                 {
5615                         thread = &dpsoftrast.threads[i];
5616                         Thread_LockMutex(thread->drawmutex);
5617                         thread->index = -1;
5618                         Thread_CondSignal(thread->drawcond);
5619                         Thread_UnlockMutex(thread->drawmutex);
5620                         Thread_WaitThread(thread->thread, 0);
5621                         Thread_DestroyCond(thread->waitcond);
5622                         Thread_DestroyCond(thread->drawcond);
5623                         Thread_DestroyMutex(thread->drawmutex);
5624                 }
5625         }
5626         for (i = 0;i < dpsoftrast.texture_end;i++)
5627                 if (dpsoftrast.texture[i].bytes)
5628                         MM_FREE(dpsoftrast.texture[i].bytes);
5629         if (dpsoftrast.texture)
5630                 free(dpsoftrast.texture);
5631         if (dpsoftrast.threads)
5632                 MM_FREE(dpsoftrast.threads);
5633         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5634 }
5635