]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
optimized MultiplyVaryingBGRA8 and VaryingBGRA8
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
8 #ifdef USE_SDL
9 //#define USE_THREADS
10 #endif
11
12 #ifdef USE_THREADS
13 #include <SDL.h>
14 #include <SDL_thread.h>
15 #endif
16
17 #ifndef __cplusplus
18 typedef qboolean bool;
19 #endif
20
// Byte counts matching the alignments hard-coded in ALIGN() (16) and ATOMIC()
// (32) below. ATOMIC_SIZE is also the alignment requested from _mm_malloc by
// MM_MALLOC/MM_CALLOC, and ALIGN_SIZE is reused as COMMAND_SIZE.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// When SSE2 is available, ALIGN()/ATOMIC() wrap a declaration in the
// compiler-specific alignment attribute. When USE_THREADS is also defined the
// ATOMIC_* helpers map onto the compiler's atomic primitives and
// MEMORY_BARRIER onto _mm_sfence().
#ifdef SSE2_PRESENT
        #if defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #ifdef USE_THREADS
                        #define MEMORY_BARRIER (_mm_sfence())
                        //(__sync_synchronize())
                        #define ATOMIC_COUNTER volatile int
                        #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                        #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                        #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
                #endif
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #ifdef USE_THREADS
                        #define MEMORY_BARRIER (_mm_sfence())
                        //(MemoryBarrier())
                        #define ATOMIC_COUNTER volatile LONG
                        #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                        #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                        // NOTE(review): unlike the other ATOMIC_ADD variants this one is not cast to void
                        #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
                #endif
        #else
                // unrecognized compiler: disable both the threaded and the SSE2 code paths
                #undef USE_THREADS
                #undef SSE2_PRESENT
        #endif
#endif

// without SSE2 the alignment wrappers leave the declaration untouched
#ifndef SSE2_PRESENT
        #define ALIGN(var) var
        #define ATOMIC(var) var
#endif

// without threads the "atomic" helpers are plain, unsynchronized operations
#ifndef USE_THREADS
        #define MEMORY_BARRIER ((void)0)
        #define ATOMIC_COUNTER int
        #define ATOMIC_INCREMENT(counter) (++(counter))
        #define ATOMIC_DECREMENT(counter) (--(counter))
        #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
65
66 #ifdef SSE2_PRESENT
67 #include <emmintrin.h>
68
69 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
70
71 static void *MM_CALLOC(size_t nmemb, size_t size)
72 {
73         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
74         if(ptr != NULL) memset(ptr, 0, nmemb*size);
75         return ptr;
76 }
77
78 #define MM_FREE _mm_free
79 #else
80 #define MM_MALLOC(size) malloc(size)
81 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
82 #define MM_FREE free
83 #endif
84
// Indices of the per-vertex attribute arrays handled by the rasterizer.
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays; it sizes
// DPSOFTRAST_State::post_array4f and DPSOFTRAST_State_Triangle::attribs.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
100
// One slot of the texture table (dpsoftrast.texture). A slot whose bytes
// pointer is NULL is treated as free.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags passed to DPSOFTRAST_Texture_New (DPSOFTRAST_TEXTURE_FLAG_* plus format bits)
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 for cubemaps, otherwise 1
        DPSOFTRAST_TEXTURE_FILTER filter; // set by DPSOFTRAST_Texture_Filter
        int mipmaps; // number of valid entries in mipmap[]
        int size; // total byte size of all mip levels
        ATOMIC_COUNTER binds; // when nonzero, the texture functions call DPSOFTRAST_Flush before modifying the texture
        unsigned char *bytes; // pixel storage for all mip levels, 4 bytes per texel
        // per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
116
// commands use the same alignment as ALIGN() data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by every command: the opcode identifying the command
// type and the command's size.
// NOTE(review): commandsize is presumably in bytes — confirm against the code that fills it in.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and an aligned struct
// DPSOFTRAST_Command_<name> that starts with the same opcode/commandsize
// header as DPSOFTRAST_Command, followed by the given fields.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
137
// byte capacity of the command pool buffer
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have — confirm at the allocation site
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Fixed-size byte buffer holding queued commands.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand; // published to dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
        int usedcommands; // NOTE(review): presumably the number of bytes currently occupied — confirm
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
148
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // per attribute array, three 4-float vectors: [0] is multiplied by the
        // span x, [1] by the span y, and [2] is the constant term
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
156
// Evaluates attribute `arrayindex` of `triangle` at the start of `span`
// using SSE: slope receives attribs[arrayindex][0] (the per-x step) and data
// receives attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// slope and data must be __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: slope and data are arrays of
// (at least) four floats. Every macro argument is parenthesized so that
// expression arguments such as `spans + i` expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
173                                         
174 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
175
// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int length; // pixel count
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
187
// capacities of the per-thread spans[] and triangles[] arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// Bit flags kept in DPSOFTRAST_State_Thread::validate. A set bit marks a
// group of derived values that DPSOFTRAST_Validate recomputes on request.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all groups needed before drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
195
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc selects
// one of them from the GL-style source/destination blend factors and stores
// it in DPSOFTRAST_State_Thread::fb_blendmode.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE,
        DPSOFTRAST_BLENDMODE_ALPHA,
        DPSOFTRAST_BLENDMODE_ADDALPHA,
        DPSOFTRAST_BLENDMODE_ADD,
        DPSOFTRAST_BLENDMODE_INVMOD,
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
210
// State owned by one rasterizer thread (an element of dpsoftrast.threads):
// its copy of the render state, the values derived from it, and its working
// set of spans and triangles.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
#ifdef USE_THREADS
        SDL_Thread *thread;
#endif
        int index;
        
        // render state; the validate flags below track which derived values
        // need recomputing
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4];
        int scissor[4]; // x, y, width, height as read by DPSOFTRAST_RecalcFB
        float depthrange[2];
        float polygonoffset[2];

        int shader_mode;
        int shader_permutation;

        // pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_clearscissor[4]; // x, y, width, height of the scissor clamped to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode; // a DPSOFTRAST_BLENDMODE value

        // NOTE(review): presumably this thread's position in the command pool — confirm against the consumer code
        ATOMIC(volatile int commandoffset);

        // NOTE(review): presumably flags used for the thread handshake with the conds/mutex below — confirm
        volatile bool waiting;
        volatile bool starving;
#ifdef USE_THREADS
        SDL_cond *waitcond;
        SDL_cond *drawcond;
        SDL_mutex *drawmutex;
#endif

        // number of used entries in spans[] and triangles[]
        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
273
// Global rasterizer state (the single instance is `dpsoftrast`): render
// targets, the caller-facing render state, vertex array bindings, the texture
// table, the worker threads and the command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        // render targets, assigned by DPSOFTRAST_SetRenderTargets
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array bindings
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // pointers into texture[]; rebased by DPSOFTRAST_Texture_Grow
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;

        // texture table
        int texture_max; // allocated number of slots in texture[]
        int texture_end; // one past the highest slot in use
        int texture_firstfree; // slot where the search for a free slot starts
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        // set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);
328
329 DPSOFTRAST_State dpsoftrast;
330
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one int laid out as
// 0xAARRGGBB, rounding each channel to the nearest 8-bit value.
// Arguments are parenthesized so expressions such as `a + b` convert
// correctly, and the alpha shift is done on an unsigned value so that alpha
// values of 128 and above do not overflow a signed int during the shift.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer format: d == 1 maps to
// 0 and d == 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
336
337 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
338 {
339         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
340         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
341         fb_viewportcenter[3] = 0.5f;
342         fb_viewportcenter[0] = 0.0f;
343         fb_viewportscale[1] = 0.5f * viewport[2];
344         fb_viewportscale[2] = -0.5f * viewport[3];
345         fb_viewportscale[3] = 0.5f;
346         fb_viewportscale[0] = 1.0f;
347 }
348
349 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
350 {
351         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
352         // and viewport projection values
353         int x1, x2;
354         int y1, y2;
355         x1 = thread->scissor[0];
356         x2 = thread->scissor[0] + thread->scissor[2];
357         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
358         y2 = dpsoftrast.fb_height - thread->scissor[1];
359         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
360         if (x1 < 0) x1 = 0;
361         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
362         if (y1 < 0) y1 = 0;
363         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
364         thread->fb_clearscissor[0] = x1;
365         thread->fb_clearscissor[1] = y1;
366         thread->fb_clearscissor[2] = x2 - x1;
367         thread->fb_clearscissor[3] = y2 - y1;
368
369         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
370 }
371
372 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
373 {
374         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
375 }
376
// Derives thread->fb_blendmode from the blend factor pair (blendfunc[0] is
// the source factor, blendfunc[1] the destination factor) and the
// blendsubtract flag. The two factors are packed into one switch key as
// (source << 16) | destination. Any pair without an entry falls back to
// DPSOFTRAST_BLENDMODE_OPAQUE.
// NOTE(review): the BLENDFUNC helper macro stays defined after this function.
static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
{
        if (thread->blendsubtract)
        {
                // subtractive blending: only one factor pair is recognized
                switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
                {
                #define BLENDFUNC(sfactor, dfactor, blendmode) \
                        case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
                BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
                default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                }
        }
        else
        {       
                switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
                {
                BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
                BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
                BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
                BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
                BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
                // two equivalent ways of requesting a multiply
                BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
                BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
                BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
                BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
                // NOTE(review): this pair selects SUBALPHA even though blendsubtract is off,
                // the same mapping as the subtractive branch above — confirm this is intended
                BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
                default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
                }
        }
}
407
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate; always evaluates to 0.
// Both arguments are parenthesized so that expression arguments expand
// correctly; note that each is evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
409
410 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
411 {
412         mask &= thread->validate;
413         if (!mask)
414                 return;
415         if (mask & DPSOFTRAST_VALIDATE_FB)
416         {
417                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
418                 DPSOFTRAST_RecalcFB(thread);
419         }
420         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
421         {
422                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
423                 DPSOFTRAST_RecalcDepthFunc(thread);
424         }
425         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
426         {
427                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
428                 DPSOFTRAST_RecalcBlendFunc(thread);
429         }
430 }
431
432 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
433 {
434         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
435                 return &dpsoftrast.texture[index];
436         return NULL;
437 }
438
439 static void DPSOFTRAST_Texture_Grow(void)
440 {
441         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
442         DPSOFTRAST_State_Thread *thread;
443         int i;
444         int j;
445         DPSOFTRAST_Flush();
446         // expand texture array as needed
447         if (dpsoftrast.texture_max < 1024)
448                 dpsoftrast.texture_max = 1024;
449         else
450                 dpsoftrast.texture_max *= 2;
451         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
452         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
453                 if(dpsoftrast.texbound[i])
454                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
455         for (j = 0; j < dpsoftrast.numthreads; j++)
456         {
457                 thread = &dpsoftrast.threads[j];
458                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
459                         if(thread->texbound[i])
460                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
461         }
462 }
463
464 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
465 {
466         int w;
467         int h;
468         int d;
469         int size;
470         int s;
471         int texnum;
472         int mipmaps;
473         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
474         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
475         DPSOFTRAST_Texture *texture;
476         if (width*height*depth < 1)
477         {
478                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
479                 return 0;
480         }
481         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
482         {
483                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
484                 return 0;
485         }
486         switch(texformat)
487         {
488         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
489         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
490         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
491                 break;
492         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
493                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
494                 {
495                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
496                         return 0;
497                 }
498                 if (depth != 1)
499                 {
500                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
501                         return 0;
502                 }
503                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
504                 {
505                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
506                         return 0;
507                 }
508                 break;
509         }
510         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
511         {
512                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
513                 return 0;
514         }
515         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
516         {
517                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
518                 return 0;
519         }
520         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
521         {
522                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
523                 return 0;
524         }
525         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526         {
527                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
528                 return 0;
529         }
530         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
531         {
532                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
533                 return 0;
534         }
535         // find first empty slot in texture array
536         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
537                 if (!dpsoftrast.texture[texnum].bytes)
538                         break;
539         dpsoftrast.texture_firstfree = texnum + 1;
540         if (dpsoftrast.texture_max <= texnum)
541                 DPSOFTRAST_Texture_Grow();
542         if (dpsoftrast.texture_end <= texnum)
543                 dpsoftrast.texture_end = texnum + 1;
544         texture = &dpsoftrast.texture[texnum];
545         memset(texture, 0, sizeof(*texture));
546         texture->flags = flags;
547         texture->width = width;
548         texture->height = height;
549         texture->depth = depth;
550         texture->sides = sides;
551         texture->binds = 0;
552         w = width;
553         h = height;
554         d = depth;
555         size = 0;
556         mipmaps = 0;
557         w = width;
558         h = height;
559         d = depth;
560         for (;;)
561         {
562                 s = w * h * d * sides * 4;
563                 texture->mipmap[mipmaps][0] = size;
564                 texture->mipmap[mipmaps][1] = s;
565                 texture->mipmap[mipmaps][2] = w;
566                 texture->mipmap[mipmaps][3] = h;
567                 texture->mipmap[mipmaps][4] = d;
568                 size += s;
569                 mipmaps++;
570                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
571                         break;
572                 if (w > 1) w >>= 1;
573                 if (h > 1) h >>= 1;
574                 if (d > 1) d >>= 1;
575         }
576         texture->mipmaps = mipmaps;
577         texture->size = size;
578
579         // allocate the pixels now
580         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
581
582         return texnum;
583 }
584 void DPSOFTRAST_Texture_Free(int index)
585 {
586         DPSOFTRAST_Texture *texture;
587         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
588         if (texture->binds)
589                 DPSOFTRAST_Flush();
590         if (texture->bytes)
591                 MM_FREE(texture->bytes);
592         texture->bytes = NULL;
593         memset(texture, 0, sizeof(*texture));
594         // adjust the free range and used range
595         if (dpsoftrast.texture_firstfree > index)
596                 dpsoftrast.texture_firstfree = index;
597         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
598                 dpsoftrast.texture_end--;
599 }
// Regenerates mip levels 1 and up of a texture, each from the level directly
// above it, by averaging neighbouring texels per byte channel with rounding.
// Does nothing for unknown textures or textures with a single level.
void DPSOFTRAST_Texture_CalculateMipmaps(int index)
{
        int i, x, y, z, w, layer0, layer1, row0, row1;
        unsigned char *o, *i0, *i1, *i2, *i3;
        DPSOFTRAST_Texture *texture;
        texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
        if (texture->mipmaps <= 1)
                return;
        for (i = 1;i < texture->mipmaps;i++)
        {
                for (z = 0;z < texture->mipmap[i][4];z++)
                {
                        // the two parent layers feeding this layer; clamped so a
                        // parent of depth 1 supplies the same layer twice
                        layer0 = z*2;
                        layer1 = z*2+1;
                        if (layer1 >= texture->mipmap[i-1][4])
                                layer1 = texture->mipmap[i-1][4]-1;
                        for (y = 0;y < texture->mipmap[i][3];y++)
                        {
                                // the two parent rows feeding this row, clamped the same way
                                row0 = y*2;
                                row1 = y*2+1;
                                if (row1 >= texture->mipmap[i-1][3])
                                        row1 = texture->mipmap[i-1][3]-1;
                                // o: output row; i0..i3: the parent rows (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
                                o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
                                i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
                                i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
                                i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
                                i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
                                w = texture->mipmap[i][2];
                                if (layer1 > layer0)
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 3D texture
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
                                                }
                                        }
                                        else
                                        {
                                                // average 3D mipmap with parent width == 1
                                                // (i2 and i3 are not advanced by this loop; a parent of
                                                // width 1 yields w == 1, so it runs a single iteration)
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
                                                        o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
                                                        o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
                                                        o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
                                                }
                                        }
                                }
                                else
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 2D texture (common case)
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
                                                }
                                        }
                                        else
                                        {
                                                // 2D texture with parent width == 1
                                                o[0] = (i0[0] + i1[0] + 1) >> 1;
                                                o[1] = (i0[1] + i1[1] + 1) >> 1;
                                                o[2] = (i0[2] + i1[2] + 1) >> 1;
                                                o[3] = (i0[3] + i1[3] + 1) >> 1;
                                        }
                                }
                        }
                }
        }
}
679 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
680 {
681         DPSOFTRAST_Texture *texture;
682         unsigned char *dst;
683         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
684         if (texture->binds)
685                 DPSOFTRAST_Flush();
686         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
687         while (blockheight > 0)
688         {
689                 memcpy(dst, pixels, blockwidth * 4);
690                 pixels += blockwidth * 4;
691                 dst += texture->mipmap[0][2] * 4;
692                 blockheight--;
693         }
694         DPSOFTRAST_Texture_CalculateMipmaps(index);
695 }
696 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
697 {
698         DPSOFTRAST_Texture *texture;
699         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
700         if (texture->binds)
701                 DPSOFTRAST_Flush();
702         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
703         DPSOFTRAST_Texture_CalculateMipmaps(index);
704 }
705 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
706 {
707         DPSOFTRAST_Texture *texture;
708         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
709         return texture->mipmap[mip][2];
710 }
711 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
712 {
713         DPSOFTRAST_Texture *texture;
714         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
715         return texture->mipmap[mip][3];
716 }
717 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
718 {
719         DPSOFTRAST_Texture *texture;
720         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
721         return texture->mipmap[mip][4];
722 }
723 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
724 {
725         DPSOFTRAST_Texture *texture;
726         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
727         if (texture->binds)
728                 DPSOFTRAST_Flush();
729         return texture->bytes + texture->mipmap[mip][0];
730 }
731 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
732 {
733         DPSOFTRAST_Texture *texture;
734         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
735         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
736         {
737                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
738                 return;
739         }
740         if (texture->binds)
741                 DPSOFTRAST_Flush();
742         texture->filter = filter;
743 }
744
745 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
746 {
747         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
748                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
749                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
750                 DPSOFTRAST_Flush();
751         dpsoftrast.fb_width = width;
752         dpsoftrast.fb_height = height;
753         dpsoftrast.fb_depthpixels = depthpixels;
754         dpsoftrast.fb_colorpixels[0] = colorpixels0;
755         dpsoftrast.fb_colorpixels[1] = colorpixels1;
756         dpsoftrast.fb_colorpixels[2] = colorpixels2;
757         dpsoftrast.fb_colorpixels[3] = colorpixels3;
758 }
759
760 static void DPSOFTRAST_Draw_FlushThreads(void);
761
// advances dpsoftrast.drawcommand to the current write position of the command pool
// the memory barrier is issued first so the command data is written out before the new
// position becomes visible (threads compare their commandoffset against drawcommand)
static void DPSOFTRAST_Draw_SyncCommands(void)
{
	MEMORY_BARRIER;
	dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
}
767
// makes sure at least 'space' units of the command pool are free before more commands are queued
// ('space' is in the same units as the command sizes passed to DPSOFTRAST_AllocateCommand)
// with USE_THREADS: waits on the thread that trails the write position the most until enough
// of the pool has been consumed; without USE_THREADS: calls DPSOFTRAST_Draw_FlushThreads
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
#ifdef USE_THREADS
	DPSOFTRAST_State_Thread *thread;
	int i;
	int freecommand = dpsoftrast.commandpool.freecommand;
	int usedcommands = dpsoftrast.commandpool.usedcommands;
	// nothing to do if the requested space is already available
	if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
		return;
	// publish everything queued so far so the threads can make progress
	DPSOFTRAST_Draw_SyncCommands();
	for(;;)
	{
		int waitindex = -1;
		int commandoffset;
		usedcommands = 0;
		// recompute the used amount as the largest distance between the write position and
		// any thread's read position (wrapping around the pool), remembering that thread
		for (i = 0; i < dpsoftrast.numthreads; i++)
		{
			thread = &dpsoftrast.threads[i]; 
			commandoffset = freecommand - thread->commandoffset;
			if (commandoffset < 0)
				commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
			if (commandoffset > usedcommands)
			{
				waitindex = i;
				usedcommands = commandoffset;
			}
		}
		// done once enough space is free, or when no thread is behind at all
		if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
			break;
		thread = &dpsoftrast.threads[waitindex];
		SDL_LockMutex(thread->drawmutex);
		// re-check under the mutex; only wait if the thread still has published commands left
		if (thread->commandoffset != dpsoftrast.drawcommand)
		{
			thread->waiting = true;
			// wake the thread if it flagged itself as starving, then wait on its waitcond
			if (thread->starving) SDL_CondSignal(thread->drawcond);
			SDL_CondWait(thread->waitcond, thread->drawmutex);
			thread->waiting = false;
		}
		SDL_UnlockMutex(thread->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = usedcommands;
#else
	DPSOFTRAST_Draw_FlushThreads();
#endif
}
813
// rounds 'size' up to the next multiple of COMMAND_SIZE
// NOTE(review): the masking only works if COMMAND_SIZE is a power of two - confirm at its definition
// 'size' is evaluated twice, so do not pass expressions with side effects
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// allocates a DPSOFTRAST_Command_<name> in the command pool, tagged with DPSOFTRAST_OPCODE_<name>,
// using the aligned size of the command struct
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
818
819 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
820 {
821         DPSOFTRAST_Command *command;
822         int freecommand = dpsoftrast.commandpool.freecommand;
823         int usedcommands = dpsoftrast.commandpool.usedcommands;
824         int extra = sizeof(DPSOFTRAST_Command);
825         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
826                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
827         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
828         {
829                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
830                 freecommand = dpsoftrast.commandpool.freecommand;
831                 usedcommands = dpsoftrast.commandpool.usedcommands;
832         }
833         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
834         {
835                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
836                 command->opcode = DPSOFTRAST_OPCODE_Reset;
837                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
838                 freecommand = 0;
839         }
840         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
841         command->opcode = opcode;
842         command->commandsize = size;
843         freecommand += size;
844         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
845                 freecommand = 0;
846         dpsoftrast.commandpool.freecommand = freecommand;
847         dpsoftrast.commandpool.usedcommands = usedcommands + size;
848         return command;
849 }
850
851 static void DPSOFTRAST_UndoCommand(int size)
852 {
853         int freecommand = dpsoftrast.commandpool.freecommand;
854         int usedcommands = dpsoftrast.commandpool.usedcommands;
855         freecommand -= size;
856         usedcommands -= size;
857         dpsoftrast.commandpool.freecommand = freecommand;
858         dpsoftrast.commandpool.usedcommands = usedcommands;
859 }
860                 
861 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
862 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 {
864         thread->viewport[0] = command->x;
865         thread->viewport[1] = command->y;
866         thread->viewport[2] = command->width;
867         thread->viewport[3] = command->height;
868         thread->validate |= DPSOFTRAST_VALIDATE_FB;
869 }
870 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 {
872         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
873         command->x = x;
874         command->y = y;
875         command->width = width;
876         command->height = height;
877
878         dpsoftrast.viewport[0] = x;
879         dpsoftrast.viewport[1] = y;
880         dpsoftrast.viewport[2] = width;
881         dpsoftrast.viewport[3] = height;
882         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
883 }
884
885 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
886 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 {
888         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
889         unsigned int *p;
890         unsigned int c;
891         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
892         x1 = thread->fb_clearscissor[0];
893         y1 = thread->fb_clearscissor[1];
894         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
895         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
896         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
897         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
898         if(y1 < t1) y1 = t1;
899         if(y2 > t2) y2 = t2;
900         w = x2 - x1;
901         h = y2 - y1;
902         if (w < 1 || h < 1)
903                 return;
904         // FIXME: honor fb_colormask?
905         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
906         for (i = 0;i < 4;i++)
907         {
908                 if (!dpsoftrast.fb_colorpixels[i])
909                         continue;
910                 for (y = y1;y < y2;y++)
911                 {
912                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
913                         for (x = x1;x < x2;x++)
914                                 p[x] = c;
915                 }
916         }
917 }
918 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
919 {
920         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
921         command->r = r;
922         command->g = g;
923         command->b = b;
924         command->a = a;
925 }
926
927 DEFCOMMAND(3, ClearDepth, float depth;)
928 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
929 {
930         int x1, y1, x2, y2, w, h, x, y, t1, t2;
931         unsigned int *p;
932         unsigned int c;
933         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
934         x1 = thread->fb_clearscissor[0];
935         y1 = thread->fb_clearscissor[1];
936         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
937         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
938         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
939         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
940         if(y1 < t1) y1 = t1;
941         if(y2 > t2) y2 = t2;
942         w = x2 - x1;
943         h = y2 - y1;
944         if (w < 1 || h < 1)
945                 return;
946         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
947         for (y = y1;y < y2;y++)
948         {
949                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
950                 for (x = x1;x < x2;x++)
951                         p[x] = c;
952         }
953 }
954 void DPSOFTRAST_ClearDepth(float d)
955 {
956         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
957         command->depth = d;
958 }
959
960 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
961 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
962 {
963         thread->colormask[0] = command->r != 0;
964         thread->colormask[1] = command->g != 0;
965         thread->colormask[2] = command->b != 0;
966         thread->colormask[3] = command->a != 0;
967         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
968 }
969 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
970 {
971         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
972         command->r = r;
973         command->g = g;
974         command->b = b;
975         command->a = a;
976 }
977
978 DEFCOMMAND(5, DepthTest, int enable;)
979 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
980 {
981         thread->depthtest = command->enable;
982         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
983 }
984 void DPSOFTRAST_DepthTest(int enable)
985 {
986         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
987         command->enable = enable;
988 }
989
990 DEFCOMMAND(6, ScissorTest, int enable;)
991 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
992 {
993         thread->scissortest = command->enable;
994         thread->validate |= DPSOFTRAST_VALIDATE_FB;
995 }
996 void DPSOFTRAST_ScissorTest(int enable)
997 {
998         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
999         command->enable = enable;
1000 }
1001
1002 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1003 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1004 {
1005         thread->scissor[0] = command->x;
1006         thread->scissor[1] = command->y;
1007         thread->scissor[2] = command->width;
1008         thread->scissor[3] = command->height;
1009         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1010 }
1011 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1012 {
1013         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1014         command->x = x;
1015         command->y = y;
1016         command->width = width;
1017         command->height = height;
1018 }
1019
1020 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1021 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1022 {
1023         thread->blendfunc[0] = command->sfactor;
1024         thread->blendfunc[1] = command->dfactor;
1025         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1026 }
1027 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1028 {
1029         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1030         command->sfactor = sfactor;
1031         command->dfactor = dfactor;
1032 }
1033
1034 DEFCOMMAND(9, BlendSubtract, int enable;)
1035 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1036 {
1037         thread->blendsubtract = command->enable;
1038         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1039 }
1040 void DPSOFTRAST_BlendSubtract(int enable)
1041 {
1042         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1043         command->enable = enable;
1044 }
1045
1046 DEFCOMMAND(10, DepthMask, int enable;)
1047 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1048 {
1049         thread->depthmask = command->enable;
1050 }
1051 void DPSOFTRAST_DepthMask(int enable)
1052 {
1053         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1054         command->enable = enable;
1055 }
1056
1057 DEFCOMMAND(11, DepthFunc, int func;)
1058 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1059 {
1060         thread->depthfunc = command->func;
1061 }
1062 void DPSOFTRAST_DepthFunc(int func)
1063 {
1064         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1065         command->func = func;
1066 }
1067
1068 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1069 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1070 {
1071         thread->depthrange[0] = command->nearval;
1072         thread->depthrange[1] = command->farval;
1073 }
1074 void DPSOFTRAST_DepthRange(float nearval, float farval)
1075 {
1076         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1077         command->nearval = nearval;
1078         command->farval = farval;
1079 }
1080
1081 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1082 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1083 {
1084         thread->polygonoffset[0] = command->alongnormal;
1085         thread->polygonoffset[1] = command->intoview;
1086 }
1087 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1088 {
1089         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1090         command->alongnormal = alongnormal;
1091         command->intoview = intoview;
1092 }
1093
1094 DEFCOMMAND(14, CullFace, int mode;)
1095 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1096 {
1097         thread->cullface = command->mode;
1098 }
1099 void DPSOFTRAST_CullFace(int mode)
1100 {
1101         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1102         command->mode = mode;
1103 }
1104
1105 DEFCOMMAND(15, AlphaTest, int enable;)
1106 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1107 {
1108         thread->alphatest = command->enable;
1109 }
1110 void DPSOFTRAST_AlphaTest(int enable)
1111 {
1112         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1113         command->enable = enable;
1114 }
1115
1116 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1117 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1118 {
1119         thread->alphafunc = command->func;
1120         thread->alphavalue = command->ref;
1121 }
1122 void DPSOFTRAST_AlphaFunc(int func, float ref)
1123 {
1124         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1125         command->func = func;
1126         command->ref = ref;
1127 }
1128
1129 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1130 {
1131         dpsoftrast.color[0] = r;
1132         dpsoftrast.color[1] = g;
1133         dpsoftrast.color[2] = b;
1134         dpsoftrast.color[3] = a;
1135 }
1136
1137 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1138 {
1139         int outstride = blockwidth * 4;
1140         int instride = dpsoftrast.fb_width * 4;
1141         int bx1 = blockx;
1142         int by1 = blocky;
1143         int bx2 = blockx + blockwidth;
1144         int by2 = blocky + blockheight;
1145         int bw;
1146         int bh;
1147         int x;
1148         int y;
1149         unsigned char *inpixels;
1150         unsigned char *b;
1151         unsigned char *o;
1152         DPSOFTRAST_Flush();
1153         if (bx1 < 0) bx1 = 0;
1154         if (by1 < 0) by1 = 0;
1155         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1156         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1157         bw = bx2 - bx1;
1158         bh = by2 - by1;
1159         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1160         if (dpsoftrast.bigendian)
1161         {
1162                 for (y = by1;y < by2;y++)
1163                 {
1164                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1165                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1166                         for (x = bx1;x < bx2;x++)
1167                         {
1168                                 o[0] = b[3];
1169                                 o[1] = b[2];
1170                                 o[2] = b[1];
1171                                 o[3] = b[0];
1172                                 o += 4;
1173                                 b += 4;
1174                         }
1175                 }
1176         }
1177         else
1178         {
1179                 for (y = by1;y < by2;y++)
1180                 {
1181                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1183                         memcpy(o, b, bw*4);
1184                 }
1185         }
1186
1187 }
1188 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1189 {
1190         int tx1 = tx;
1191         int ty1 = ty;
1192         int tx2 = tx + width;
1193         int ty2 = ty + height;
1194         int sx1 = sx;
1195         int sy1 = sy;
1196         int sx2 = sx + width;
1197         int sy2 = sy + height;
1198         int swidth;
1199         int sheight;
1200         int twidth;
1201         int theight;
1202         int sw;
1203         int sh;
1204         int tw;
1205         int th;
1206         int y;
1207         unsigned int *spixels;
1208         unsigned int *tpixels;
1209         DPSOFTRAST_Texture *texture;
1210         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1211         if (mip < 0 || mip >= texture->mipmaps) return;
1212         if (texture->binds)
1213                 DPSOFTRAST_Flush();
1214         spixels = dpsoftrast.fb_colorpixels[0];
1215         swidth = dpsoftrast.fb_width;
1216         sheight = dpsoftrast.fb_height;
1217         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1218         twidth = texture->mipmap[mip][2];
1219         theight = texture->mipmap[mip][3];
1220         if (tx1 < 0) tx1 = 0;
1221         if (ty1 < 0) ty1 = 0;
1222         if (tx2 > twidth) tx2 = twidth;
1223         if (ty2 > theight) ty2 = theight;
1224         if (sx1 < 0) sx1 = 0;
1225         if (sy1 < 0) sy1 = 0;
1226         if (sx2 > swidth) sx2 = swidth;
1227         if (sy2 > sheight) sy2 = sheight;
1228         tw = tx2 - tx1;
1229         th = ty2 - ty1;
1230         sw = sx2 - sx1;
1231         sh = sy2 - sy1;
1232         if (tw > sw) tw = sw;
1233         if (th > sh) th = sh;
1234         if (tw < 1 || th < 1)
1235                 return;
1236         for (y = 0;y < th;y++)
1237                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1238         if (texture->mipmaps > 1)
1239                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1240 }
1241
1242 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1243 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1244 {
1245         if (thread->texbound[command->unitnum])
1246                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1247         thread->texbound[command->unitnum] = command->texture;
1248 }
1249 void DPSOFTRAST_SetTexture(int unitnum, int index)
1250 {
1251         DPSOFTRAST_Command_SetTexture *command;
1252         DPSOFTRAST_Texture *texture;
1253         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1254         {
1255                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1256                 return;
1257         }
1258         texture = DPSOFTRAST_Texture_GetByIndex(index);
1259         if (index && !texture)
1260         {
1261                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1262                 return;
1263         }
1264
1265         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1266         command->unitnum = unitnum;
1267         command->texture = texture;
1268
1269         dpsoftrast.texbound[unitnum] = texture;
1270         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1271 }
1272
1273 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1274 {
1275         dpsoftrast.pointer_vertex3f = vertex3f;
1276         dpsoftrast.stride_vertex = stride;
1277 }
1278 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1279 {
1280         dpsoftrast.pointer_color4f = color4f;
1281         dpsoftrast.pointer_color4ub = NULL;
1282         dpsoftrast.stride_color = stride;
1283 }
1284 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1285 {
1286         dpsoftrast.pointer_color4f = NULL;
1287         dpsoftrast.pointer_color4ub = color4ub;
1288         dpsoftrast.stride_color = stride;
1289 }
1290 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1291 {
1292         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1293         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1294         dpsoftrast.stride_texcoord[unitnum] = stride;
1295 }
1296
1297 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1298 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1299 {
1300         thread->shader_mode = command->mode;
1301         thread->shader_permutation = command->permutation;
1302 }
1303 void DPSOFTRAST_SetShader(int mode, int permutation)
1304 {
1305         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1306         command->mode = mode;
1307         command->permutation = permutation;
1308
1309         dpsoftrast.shader_mode = mode;
1310         dpsoftrast.shader_permutation = permutation;
1311 }
1312
1313 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1314 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1315 {
1316         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1317 }
1318 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1319 {
1320         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1321         command->index = index;
1322         command->val[0] = v0;
1323         command->val[1] = v1;
1324         command->val[2] = v2;
1325         command->val[3] = v3;
1326
1327         dpsoftrast.uniform4f[index*4+0] = v0;
1328         dpsoftrast.uniform4f[index*4+1] = v1;
1329         dpsoftrast.uniform4f[index*4+2] = v2;
1330         dpsoftrast.uniform4f[index*4+3] = v3;
1331 }
1332 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         memcpy(command->val, v, sizeof(command->val));
1337
1338         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1339 }
1340
1341 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1342 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1343 {
1344         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1345 }
1346 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1347 {
1348 #ifdef SSE2_PRESENT
1349         int i, index;
1350         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1351         {
1352                 __m128 m0, m1, m2, m3;
1353                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1354                 command->index = index;
1355                 if (((size_t)v)&(ALIGN_SIZE-1))
1356                 {
1357                         m0 = _mm_loadu_ps(v);
1358                         m1 = _mm_loadu_ps(v+4);
1359                         m2 = _mm_loadu_ps(v+8);
1360                         m3 = _mm_loadu_ps(v+12);
1361                 }
1362                 else
1363                 {
1364                         m0 = _mm_load_ps(v);
1365                         m1 = _mm_load_ps(v+4);
1366                         m2 = _mm_load_ps(v+8);
1367                         m3 = _mm_load_ps(v+12);
1368                 }
1369                 if (transpose)
1370                 {
1371                         __m128 t0, t1, t2, t3;
1372                         t0 = _mm_unpacklo_ps(m0, m1);
1373                         t1 = _mm_unpacklo_ps(m2, m3);
1374                         t2 = _mm_unpackhi_ps(m0, m1);
1375                         t3 = _mm_unpackhi_ps(m2, m3);
1376                         m0 = _mm_movelh_ps(t0, t1);
1377                         m1 = _mm_movehl_ps(t1, t0);
1378                         m2 = _mm_movelh_ps(t2, t3);
1379                         m3 = _mm_movehl_ps(t3, t2);                     
1380                 }
1381                 _mm_store_ps(command->val, m0);
1382                 _mm_store_ps(command->val+4, m1);
1383                 _mm_store_ps(command->val+8, m2);
1384                 _mm_store_ps(command->val+12, m3);
1385                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1386                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1387                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1388                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1389         }
1390 #endif
1391 }
1392
1393 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1394 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1395 {
1396         thread->uniform1i[command->index] = command->val;
1397 }
1398 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1399 {
1400         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1401         command->index = index;
1402         command->val = i0;
1403
1404         dpsoftrast.uniform1i[command->index] = i0;
1405 }
1406
1407 #ifdef SSE2_PRESENT
1408 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1409 {
1410         float *end = dst + size*4;
1411         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1412         {
1413                 while (dst < end)
1414                 {
1415                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1416                         dst += 4;
1417                         src += stride;
1418                 }
1419         }
1420         else
1421         {
1422                 while (dst < end)
1423                 {
1424                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1425                         dst += 4;
1426                         src += stride;
1427                 }
1428         }
1429 }
1430
1431 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1432 {
1433         float *end = dst + size*4;
1434         if (stride == sizeof(float[3]))
1435         {
1436                 float *end4 = dst + (size&~3)*4;        
1437                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1438                 {
1439                         while (dst < end4)
1440                         {
1441                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1442                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1443                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1444                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1445                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1446                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1447                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1448                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1449                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1450                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1453                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1454                                 dst += 16;
1455                                 src += 4*sizeof(float[3]);
1456                         }
1457                 }
1458                 else
1459                 {
1460                         while (dst < end4)
1461                         {
1462                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1463                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1464                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1467                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1468                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1470                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1471                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dst += 16;
1476                                 src += 4*sizeof(float[3]);
1477                         }
1478                 }
1479         }
1480         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1481         {
1482                 while (dst < end)
1483                 {
1484                         __m128 v = _mm_loadu_ps((const float *)src);
1485                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1486                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1487                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1488                         _mm_store_ps(dst, v);
1489                         dst += 4;
1490                         src += stride;
1491                 }
1492         }
1493         else
1494         {
1495                 while (dst < end)
1496                 {
1497                         __m128 v = _mm_load_ps((const float *)src);
1498                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1499                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1500                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1501                         _mm_store_ps(dst, v);
1502                         dst += 4;
1503                         src += stride;
1504                 }
1505         }
1506 }
1507
1508 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1509 {
1510         float *end = dst + size*4;
1511         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1512         if (stride == sizeof(float[2]))
1513         {
1514                 float *end2 = dst + (size&~1)*4;
1515                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1516                 {
1517                         while (dst < end2)
1518                         {
1519                                 __m128 v = _mm_loadu_ps((const float *)src);
1520                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1521                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1522                                 dst += 8;
1523                                 src += 2*sizeof(float[2]);
1524                         }
1525                 }
1526                 else
1527                 {
1528                         while (dst < end2)
1529                         {
1530                                 __m128 v = _mm_load_ps((const float *)src);
1531                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1532                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1533                                 dst += 8;
1534                                 src += 2*sizeof(float[2]);
1535                         }
1536                 }
1537         }
1538         while (dst < end)
1539         {
1540                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1541                 dst += 4;
1542                 src += stride;
1543         }
1544 }
1545
1546 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1547 {
1548         float *end = dst + size*4;
1549         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1550         if (stride == sizeof(unsigned char[4]))
1551         {
1552                 float *end4 = dst + (size&~3)*4;
1553                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1554                 {
1555                         while (dst < end4)
1556                         {
1557                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1558                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1559                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1560                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1561                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1562                                 dst += 16;
1563                                 src += 4*sizeof(unsigned char[4]);
1564                         }
1565                 }
1566                 else
1567                 {
1568                         while (dst < end4)
1569                         {
1570                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1571                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1572                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1574                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1575                                 dst += 16;
1576                                 src += 4*sizeof(unsigned char[4]);
1577                         }
1578                 }
1579         }
1580         while (dst < end)
1581         {
1582                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1583                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1584                 dst += 4;
1585                 src += stride;
1586         }
1587 }
1588
// writes size copies of the 4-float vector at src to dst
// src is read with an unaligned load, dst is written with aligned 16-byte stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float *stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1599 #endif
1600
// multiplies numitems 4-float vectors from in4f by the 16-float matrix
// inmatrix16f and stores the results to out4f (aligned 16-byte stores)
// each result is x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// an identity matrix degenerates to a plain copy (skipped when out4f == in4f)
// without SSE2_PRESENT this function does nothing
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			// summed innermost-first: tx + (ty + (tz + tw))
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1648
// copies numitems 4-float vectors from in4f to out4f (the ranges must not overlap)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	size_t numbytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, numbytes);
}
1653
1654 #ifdef SSE2_PRESENT
// out = viewport mapping of the 4-float vector in after division by its lane 3
// with in = (x, y, z, w) the result is
//   (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where c = viewportcenter and s = viewportscale, i.e. lane 0 of both
// viewport vectors applies to the 1/w term and lanes 1..3 apply to x, y, z
// expands to a braced block declaring locals p and w, so arguments must not
// be expressions that refer to variables with those names
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_div_ps(_mm_mul_ps(viewportscale, p), w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1662
// performs the same computation as DPSOFTRAST_PROJECTVERTEX:
// with in = (x, y, z, w), c = viewportcenter and s = viewportscale,
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// expands to a braced block declaring locals p and w, so arguments must not
// be expressions that refer to variables with those names
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_div_ps(_mm_mul_ps(viewportscale, p), w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1670
// out = x*m0 + y*m1 + z*m2 + w*m3 for in = (x, y, z, w)
// the terms are summed innermost-first: x*m0 + (y*m1 + (z*m2 + w*m3))
// expands to a braced block declaring a local p, so arguments must not be
// expressions that refer to a variable with that name
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1679
// transforms the 8 corners of the box spanned by minpos/maxpos with the matrix
// vectors m0..m3 and derives a row range from them
// returns a mask with bit k set for every corner k whose transformed
// (lane 2 + lane 3) sum is negative; such corners are excluded from the range,
// but the points where their box edges towards non-excluded corners reach a
// zero sum are included instead
// the tracked value is lane 1 of the transformed point divided by its lane 3
// (the matrix vectors get their first two lanes swapped so that this value
// sits in lane 0); its min/max are clamped to [-2, 2] and mapped through lane 2
// of viewportcenter/viewportscale
// *starty receives the truncated mapped maximum and *endy the truncated mapped
// minimum plus one
// NOTE(review): presumably the y viewport scale is negative so that
// *starty <= *endy - confirm against where fb_viewportscale is set up
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	__m128 bb[8], clipdist[8];
	// start with an inverted (empty) range
	__m128 minproj = _mm_set_ss(2.0f);
	__m128 maxproj = _mm_set_ss(-2.0f);
	// swap lanes 0 and 1 of every matrix vector
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// BBFRONT(k, pos): transform corner k, record its clip distance and, if
	// that distance is >= 0, clear its clipmask bit and add it to the range
	#define BBFRONT(k, pos) \
	{ \
		__m128 bbw; \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		bbw = _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), bbw); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			__m128 proj = _mm_div_ss(bb[k], bbw); \
			clipmask &= ~(1<<k); \
			minproj = _mm_min_ss(minproj, proj); \
			maxproj = _mm_max_ss(maxproj, proj); \
		} \
	}
	// corner index bits: bit 0 selects lane 0 from maxpos, bit 1 selects
	// lane 1 from maxpos, bit 2 selects lanes 2 and 3 from maxpos
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// BBCLIP(k): for an excluded corner k, visit the three corners that differ
	// from it in one index bit; for each one that is not excluded, interpolate
	// along the edge to where the clip distance reaches zero and add that
	// point to the range
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			if (!(clipmask&(1<<(k^1)))) \
			{ \
				__m128 t = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
				__m128 hit = _mm_sub_ps(bb[k^1], bb[k]); \
				hit = _mm_mul_ps(_mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 0, 0, 0)), hit); \
				hit = _mm_add_ps(bb[k], hit); \
				hit = _mm_div_ss(hit, _mm_shuffle_ps(hit, hit, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, hit); \
				maxproj = _mm_max_ss(maxproj, hit); \
			} \
			if (!(clipmask&(1<<(k^2)))) \
			{ \
				__m128 t = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
				__m128 hit = _mm_sub_ps(bb[k^2], bb[k]); \
				hit = _mm_mul_ps(_mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 0, 0, 0)), hit); \
				hit = _mm_add_ps(bb[k], hit); \
				hit = _mm_div_ss(hit, _mm_shuffle_ps(hit, hit, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, hit); \
				maxproj = _mm_max_ss(maxproj, hit); \
			} \
			if (!(clipmask&(1<<(k^4)))) \
			{ \
				__m128 t = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
				__m128 hit = _mm_sub_ps(bb[k^4], bb[k]); \
				hit = _mm_mul_ps(_mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 0, 0, 0)), hit); \
				hit = _mm_add_ps(bb[k], hit); \
				hit = _mm_div_ss(hit, _mm_shuffle_ps(hit, hit, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, hit); \
				maxproj = _mm_max_ss(maxproj, hit); \
			} \
		} \
	}
	BBCLIP(0);
	BBCLIP(1);
	BBCLIP(2);
	BBCLIP(3);
	BBCLIP(4);
	BBCLIP(5);
	BBCLIP(6);
	BBCLIP(7);
	// bring lane 2 of the viewport vectors into lane 0 for the scalar math below
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// clamp the range to [-2, 2] before mapping it
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1750 #endif
1751         
// copies numitems 4-float vertices from in4f to out4f and writes their
// viewport projection (see DPSOFTRAST_PROJECTVERTEX) to screen4f, using
// dpsoftrast.fb_viewportcenter and dpsoftrast.fb_viewportscale
// out4f and screen4f are written with aligned 16-byte stores
// if both starty and endy are non-NULL, the component-wise min/max of the
// input vertices is passed to DPSOFTRAST_Vertex_BoundY with an identity
// matrix, which fills *starty / *endy, and its clip mask is returned
// otherwise (and in builds without SSE2_PRESENT, where nothing is processed)
// the function returns 0
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	__m128 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	__m128 minpos, maxpos;
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		// the bounds are seeded with the first vertex
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
					_mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
#endif
	// kept outside the #ifdef so that every build returns a defined value
	return 0;
}
1799
// transforms numitems 4-float vertices from in4f by the 16-float matrix
// inmatrix16f, stores the transformed vertices to out4f and their viewport
// projection (see DPSOFTRAST_PROJECTVERTEX) to screen4f
// out4f and screen4f are written with aligned 16-byte stores
// an identity matrix is forwarded to DPSOFTRAST_Vertex_Project
// if both starty and endy are non-NULL, the component-wise min/max of the
// untransformed input vertices is passed together with the matrix to
// DPSOFTRAST_Vertex_BoundY, which fills *starty / *endy, and its clip mask
// is returned
// otherwise (and in builds without SSE2_PRESENT, where nothing is processed)
// the function returns 0
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
		return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
	end = out4f + numitems*4;
	viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		// the bounds are seeded with the first vertex
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			// bounds are gathered before the transform is applied
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
#endif
	// kept outside the #ifdef so that every build returns a defined value
	return 0;
}
1854
1855 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1856 {
1857         float *outf = dpsoftrast.post_array4f[outarray];
1858         const unsigned char *inb;
1859         int firstvertex = dpsoftrast.firstvertex;
1860         int numvertices = dpsoftrast.numvertices;
1861         int stride;
1862         switch(inarray)
1863         {
1864         case DPSOFTRAST_ARRAY_POSITION:
1865                 stride = dpsoftrast.stride_vertex;
1866                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1867                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1868                 break;
1869         case DPSOFTRAST_ARRAY_COLOR:
1870                 stride = dpsoftrast.stride_color;
1871                 if (dpsoftrast.pointer_color4f)
1872                 {
1873                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1874                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1875                 }
1876                 else if (dpsoftrast.pointer_color4ub)
1877                 {
1878                         stride = dpsoftrast.stride_color;
1879                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1880                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1881                 }
1882                 else
1883                 {
1884                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1885                 }
1886                 break;
1887         default:
1888                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1889                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1890                 {
1891                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1892                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1893                         {
1894                         case 2:
1895                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1896                                 break;
1897                         case 3:
1898                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1899                                 break;
1900                         case 4:
1901                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1902                                 break;
1903                         }
1904                 }
1905                 break;
1906         }
1907         return outf;
1908 }
1909
1910 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1911 {
1912         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1913         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1914         return data;
1915 }
1916
#if 0
// (compiled out) projects dpsoftrast.post_array4f[outarray] in place into
// dpsoftrast.screencoord4f, storing the clip mask in dpsoftrast.drawclipped
// and the row range in dpsoftrast.drawstarty / dpsoftrast.drawendy;
// when inarray >= 0 the buffer is first filled via DPSOFTRAST_Array_Load
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
}
#endif
1925
1926 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1927 {
1928         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1929         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1930         return data;
1931 }
1932
1933 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1934 {
1935         int x;
1936         int startx = span->startx;
1937         int endx = span->endx;
1938         float wslope = triangle->w[0];
1939         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1940         float endz = 1.0f / (w + wslope * startx);
1941         for (x = startx;x < endx;)
1942         {
1943                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1944                 float z = endz, dz;
1945                 if(nextsub >= endx) nextsub = endsub = endx-1;
1946                 endz = 1.0f / (w + wslope * nextsub);
1947                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1948                 for (; x <= endsub; x++, z += dz)
1949                         zf[x] = z;
1950         }
1951 }
1952
1953 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1954 {
1955         int x;
1956         int startx = span->startx;
1957         int endx = span->endx;
1958         int d[4];
1959         float a, b;
1960         unsigned char * RESTRICT pixelmask = span->pixelmask;
1961         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1962         if (!pixel)
1963                 return;
1964         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1965         // handle alphatest now (this affects depth writes too)
1966         if (thread->alphatest)
1967                 for (x = startx;x < endx;x++)
1968                         if (in4f[x*4+3] < 0.5f)
1969                                 pixelmask[x] = false;
1970         // FIXME: this does not handle bigendian
1971         switch(thread->fb_blendmode)
1972         {
1973         case DPSOFTRAST_BLENDMODE_OPAQUE:
1974                 for (x = startx;x < endx;x++)
1975                 {
1976                         if (!pixelmask[x])
1977                                 continue;
1978                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1979                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1980                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1981                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1982                         pixel[x*4+0] = d[0];
1983                         pixel[x*4+1] = d[1];
1984                         pixel[x*4+2] = d[2];
1985                         pixel[x*4+3] = d[3];
1986                 }
1987                 break;
1988         case DPSOFTRAST_BLENDMODE_ALPHA:
1989                 for (x = startx;x < endx;x++)
1990                 {
1991                         if (!pixelmask[x])
1992                                 continue;
1993                         a = in4f[x*4+3] * 255.0f;
1994                         b = 1.0f - in4f[x*4+3];
1995                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1996                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1997                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1998                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1999                         pixel[x*4+0] = d[0];
2000                         pixel[x*4+1] = d[1];
2001                         pixel[x*4+2] = d[2];
2002                         pixel[x*4+3] = d[3];
2003                 }
2004                 break;
2005         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2006                 for (x = startx;x < endx;x++)
2007                 {
2008                         if (!pixelmask[x])
2009                                 continue;
2010                         a = in4f[x*4+3] * 255.0f;
2011                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2012                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2013                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2014                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2015                         pixel[x*4+0] = d[0];
2016                         pixel[x*4+1] = d[1];
2017                         pixel[x*4+2] = d[2];
2018                         pixel[x*4+3] = d[3];
2019                 }
2020                 break;
2021         case DPSOFTRAST_BLENDMODE_ADD:
2022                 for (x = startx;x < endx;x++)
2023                 {
2024                         if (!pixelmask[x])
2025                                 continue;
2026                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2027                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2028                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2029                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2030                         pixel[x*4+0] = d[0];
2031                         pixel[x*4+1] = d[1];
2032                         pixel[x*4+2] = d[2];
2033                         pixel[x*4+3] = d[3];
2034                 }
2035                 break;
2036         case DPSOFTRAST_BLENDMODE_INVMOD:
2037                 for (x = startx;x < endx;x++)
2038                 {
2039                         if (!pixelmask[x])
2040                                 continue;
2041                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2042                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2043                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2044                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2045                         pixel[x*4+0] = d[0];
2046                         pixel[x*4+1] = d[1];
2047                         pixel[x*4+2] = d[2];
2048                         pixel[x*4+3] = d[3];
2049                 }
2050                 break;
2051         case DPSOFTRAST_BLENDMODE_MUL:
2052                 for (x = startx;x < endx;x++)
2053                 {
2054                         if (!pixelmask[x])
2055                                 continue;
2056                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2057                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2058                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2059                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2060                         pixel[x*4+0] = d[0];
2061                         pixel[x*4+1] = d[1];
2062                         pixel[x*4+2] = d[2];
2063                         pixel[x*4+3] = d[3];
2064                 }
2065                 break;
2066         case DPSOFTRAST_BLENDMODE_MUL2:
2067                 for (x = startx;x < endx;x++)
2068                 {
2069                         if (!pixelmask[x])
2070                                 continue;
2071                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2072                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2073                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2074                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2075                         pixel[x*4+0] = d[0];
2076                         pixel[x*4+1] = d[1];
2077                         pixel[x*4+2] = d[2];
2078                         pixel[x*4+3] = d[3];
2079                 }
2080                 break;
2081         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2082                 for (x = startx;x < endx;x++)
2083                 {
2084                         if (!pixelmask[x])
2085                                 continue;
2086                         a = in4f[x*4+3] * -255.0f;
2087                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2088                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2089                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2090                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2091                         pixel[x*4+0] = d[0];
2092                         pixel[x*4+1] = d[1];
2093                         pixel[x*4+2] = d[2];
2094                         pixel[x*4+3] = d[3];
2095                 }
2096                 break;
2097         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2098                 for (x = startx;x < endx;x++)
2099                 {
2100                         if (!pixelmask[x])
2101                                 continue;
2102                         a = 255.0f;
2103                         b = 1.0f - in4f[x*4+3];
2104                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2105                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2106                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2107                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2108                         pixel[x*4+0] = d[0];
2109                         pixel[x*4+1] = d[1];
2110                         pixel[x*4+2] = d[2];
2111                         pixel[x*4+3] = d[3];
2112                 }
2113                 break;
2114         }
2115 }
2116
2117 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2118 {
2119 #ifdef SSE2_PRESENT
2120         int x;
2121         int startx = span->startx;
2122         int endx = span->endx;
2123         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2124         unsigned char * RESTRICT pixelmask = span->pixelmask;
2125         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2126         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2127         if (!pixel)
2128                 return;
2129         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2130         pixeli += span->y * dpsoftrast.fb_width + span->x;
2131         // handle alphatest now (this affects depth writes too)
2132         if (thread->alphatest)
2133                 for (x = startx;x < endx;x++)
2134                         if (in4ub[x*4+3] < 0.5f)
2135                                 pixelmask[x] = false;
2136         // FIXME: this does not handle bigendian
2137         switch(thread->fb_blendmode)
2138         {
2139         case DPSOFTRAST_BLENDMODE_OPAQUE:
2140                 for (x = startx;x + 4 <= endx;)
2141                 {
2142                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2143                         {
2144                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2145                                 x += 4;
2146                         }
2147                         else
2148                         {
2149                                 if (pixelmask[x])
2150                                         pixeli[x] = ini[x];
2151                                 x++;
2152                         }
2153                 }
2154                 for (;x < endx;x++)
2155                         if (pixelmask[x])
2156                                 pixeli[x] = ini[x];
2157                 break;
2158         case DPSOFTRAST_BLENDMODE_ALPHA:
2159         #define FINISHBLEND(blend2, blend1) \
2160                 for (x = startx;x + 2 <= endx;x += 2) \
2161                 { \
2162                         __m128i src, dst; \
2163                         switch (*(const unsigned short*)&pixelmask[x]) \
2164                         { \
2165                         case 0x0101: \
2166                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2167                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2168                                 blend2; \
2169                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2170                                 continue; \
2171                         case 0x0100: \
2172                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2173                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2174                                 blend1; \
2175                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2176                                 continue; \
2177                         case 0x0001: \
2178                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2179                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2180                                 blend1; \
2181                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2182                                 continue; \
2183                         } \
2184                         break; \
2185                 } \
2186                 for(;x < endx; x++) \
2187                 { \
2188                         __m128i src, dst; \
2189                         if (!pixelmask[x]) \
2190                                 continue; \
2191                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2192                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2193                         blend1; \
2194                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2195                 }
2196
2197                 FINISHBLEND({
2198                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2199                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2200                 }, {
2201                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2202                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2203                 });
2204                 break;
2205         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2206                 FINISHBLEND({
2207                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2208                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2209                 }, {
2210                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2211                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2212                 });
2213                 break;
2214         case DPSOFTRAST_BLENDMODE_ADD:
2215                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2216                 break;
2217         case DPSOFTRAST_BLENDMODE_INVMOD:
2218                 FINISHBLEND({
2219                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2220                 }, {
2221                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2222                 });
2223                 break;
2224         case DPSOFTRAST_BLENDMODE_MUL:
2225                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2226                 break;
2227         case DPSOFTRAST_BLENDMODE_MUL2:
2228                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2229                 break;
2230         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2231                 FINISHBLEND({
2232                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2233                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2234                 }, {
2235                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2236                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2237                 });
2238                 break;
2239         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2240                 FINISHBLEND({
2241                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2242                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2243                 }, {
2244                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2245                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2246                 });
2247                 break;
2248         }
2249 #endif
2250 }
2251
2252 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2253 {
2254         int x;
2255         int startx = span->startx;
2256         int endx = span->endx;
2257         int flags;
2258         float c[4];
2259         float data[4];
2260         float slope[4];
2261         float tc[2], endtc[2];
2262         float tcscale[2];
2263         unsigned int tci[2];
2264         unsigned int tci1[2];
2265         unsigned int tcimin[2];
2266         unsigned int tcimax[2];
2267         int tciwrapmask[2];
2268         int tciwidth;
2269         int filter;
2270         int mip;
2271         const unsigned char * RESTRICT pixelbase;
2272         const unsigned char * RESTRICT pixel[4];
2273         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2274         // if no texture is bound, just fill it with white
2275         if (!texture)
2276         {
2277                 for (x = startx;x < endx;x++)
2278                 {
2279                         out4f[x*4+0] = 1.0f;
2280                         out4f[x*4+1] = 1.0f;
2281                         out4f[x*4+2] = 1.0f;
2282                         out4f[x*4+3] = 1.0f;
2283                 }
2284                 return;
2285         }
2286         mip = triangle->mip[texunitindex];
2287         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2288         // if this mipmap of the texture is 1 pixel, just fill it with that color
2289         if (texture->mipmap[mip][1] == 4)
2290         {
2291                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2292                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2293                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2294                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2295                 for (x = startx;x < endx;x++)
2296                 {
2297                         out4f[x*4+0] = c[0];
2298                         out4f[x*4+1] = c[1];
2299                         out4f[x*4+2] = c[2];
2300                         out4f[x*4+3] = c[3];
2301                 }
2302                 return;
2303         }
2304         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2305         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2306         flags = texture->flags;
2307         tcscale[0] = texture->mipmap[mip][2];
2308         tcscale[1] = texture->mipmap[mip][3];
2309         tciwidth = texture->mipmap[mip][2];
2310         tcimin[0] = 0;
2311         tcimin[1] = 0;
2312         tcimax[0] = texture->mipmap[mip][2]-1;
2313         tcimax[1] = texture->mipmap[mip][3]-1;
2314         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2315         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2316         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2317         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2318         for (x = startx;x < endx;)
2319         {
2320                 unsigned int subtc[2];
2321                 unsigned int substep[2];
2322                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2323                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2324                 if(nextsub >= endx)
2325                 {
2326                         nextsub = endsub = endx-1;      
2327                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2328                 }
2329                 tc[0] = endtc[0];
2330                 tc[1] = endtc[1];
2331                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2332                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2333                 substep[0] = (endtc[0] - tc[0]) * subscale;
2334                 substep[1] = (endtc[1] - tc[1]) * subscale;
2335                 subtc[0] = tc[0] * (1<<16);
2336                 subtc[1] = tc[1] * (1<<16);
2337                 if(filter)
2338                 {
2339                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2340                         {
2341                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2342                                 {
2343                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2344                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2345                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2346                                         tci[0] = subtc[0]>>16;
2347                                         tci[1] = subtc[1]>>16;
2348                                         tci1[0] = tci[0] + 1;
2349                                         tci1[1] = tci[1] + 1;
2350                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2351                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2352                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2353                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2354                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2355                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2356                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2357                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2358                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2359                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2360                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2361                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2362                                         out4f[x*4+0] = c[0];
2363                                         out4f[x*4+1] = c[1];
2364                                         out4f[x*4+2] = c[2];
2365                                         out4f[x*4+3] = c[3];
2366                                 }
2367                         }
2368                         else
2369                         {
2370                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2371                                 {
2372                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2373                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2374                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2375                                         tci[0] = subtc[0]>>16;
2376                                         tci[1] = subtc[1]>>16;
2377                                         tci1[0] = tci[0] + 1;
2378                                         tci1[1] = tci[1] + 1;
2379                                         tci[0] &= tciwrapmask[0];
2380                                         tci[1] &= tciwrapmask[1];
2381                                         tci1[0] &= tciwrapmask[0];
2382                                         tci1[1] &= tciwrapmask[1];
2383                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2384                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2385                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2386                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2387                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2388                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2389                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2390                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2391                                         out4f[x*4+0] = c[0];
2392                                         out4f[x*4+1] = c[1];
2393                                         out4f[x*4+2] = c[2];
2394                                         out4f[x*4+3] = c[3];
2395                                 }
2396                         }
2397                 }
2398                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2399                 {
2400                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2401                         {
2402                                 tci[0] = subtc[0]>>16;
2403                                 tci[1] = subtc[1]>>16;
2404                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2405                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2406                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2407                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2408                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2409                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2410                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2411                                 out4f[x*4+0] = c[0];
2412                                 out4f[x*4+1] = c[1];
2413                                 out4f[x*4+2] = c[2];
2414                                 out4f[x*4+3] = c[3];
2415                         }
2416                 }
2417                 else
2418                 {
2419                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2420                         {
2421                                 tci[0] = subtc[0]>>16;
2422                                 tci[1] = subtc[1]>>16;
2423                                 tci[0] &= tciwrapmask[0];
2424                                 tci[1] &= tciwrapmask[1];
2425                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2426                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2427                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2428                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2429                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2430                                 out4f[x*4+0] = c[0];
2431                                 out4f[x*4+1] = c[1];
2432                                 out4f[x*4+2] = c[2];
2433                                 out4f[x*4+3] = c[3];
2434                         }
2435                 }
2436         }
2437 }
2438
2439 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2440 {
2441 #ifdef SSE2_PRESENT
2442         int x;
2443         int startx = span->startx;
2444         int endx = span->endx;
2445         int flags;
2446         __m128 data, slope, tcscale;
2447         __m128i tcsize, tcmask, tcoffset, tcmax;
2448         __m128 tc, endtc;
2449         __m128i subtc, substep, endsubtc;
2450         int filter;
2451         int mip;
2452         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2453         const unsigned char * RESTRICT pixelbase;
2454         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2455         // if no texture is bound, just fill it with white
2456         if (!texture)
2457         {
2458                 memset(out4ub + startx*4, 255, span->length*4);
2459                 return;
2460         }
2461         mip = triangle->mip[texunitindex];
2462         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2463         // if this mipmap of the texture is 1 pixel, just fill it with that color
2464         if (texture->mipmap[mip][1] == 4)
2465         {
2466                 unsigned int k = *((const unsigned int *)pixelbase);
2467                 for (x = startx;x < endx;x++)
2468                         outi[x] = k;
2469                 return;
2470         }
2471         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2473         flags = texture->flags;
2474         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2475         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2476         tcscale = _mm_cvtepi32_ps(tcsize);
2477         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2478         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2479         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2480         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2481         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2482         tcmax = _mm_packs_epi32(tcmask, tcmask);
2483         for (x = startx;x < endx;)
2484         {
2485                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2486                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2487                 if(nextsub >= endx)
2488                 {
2489                         nextsub = endsub = endx-1;
2490                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2491                 }       
2492                 tc = endtc;
2493                 subtc = endsubtc;
2494                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2495                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2496                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2497                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2498                 substep = _mm_slli_epi32(substep, 1);
2499                 if (filter)
2500                 {
2501                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2502                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2503                         {
2504                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2505                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2506                                 {
2507                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2508                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2509                                         tci = _mm_madd_epi16(tci, tcoffset);
2510                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2511                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2512                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2513                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2514                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2515                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2516                                         fracm = _mm_srli_epi16(subtc, 1);
2517                                         pix1 = _mm_add_epi16(pix1,
2518                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2519                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2520                                         pix3 = _mm_add_epi16(pix3,
2521                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2522                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2523                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2524                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2525                                         pix2 = _mm_add_epi16(pix2,
2526                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2527                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2528                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2529                                 }
2530                                 if (x <= endsub)
2531                                 {
2532                                         const unsigned char * RESTRICT ptr1;
2533                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2534                                         tci = _mm_madd_epi16(tci, tcoffset);
2535                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2536                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2537                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2538                                         fracm = _mm_srli_epi16(subtc, 1);
2539                                         pix1 = _mm_add_epi16(pix1,
2540                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2541                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2542                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2543                                         pix1 = _mm_add_epi16(pix1,
2544                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2545                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2546                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2547                                         x++;
2548                                 }
2549                         }
2550                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2551                         {
2552                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2553                                 {
2554                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2555                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2556                                         tci = _mm_madd_epi16(tci, tcoffset);
2557                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2558                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2559                                                                                         _mm_setzero_si128());
2560                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2561                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2562                                                                                         _mm_setzero_si128());
2563                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2564                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2565                                         tci = _mm_madd_epi16(tci, tcoffset);
2566                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2567                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2568                                                                                         _mm_setzero_si128());
2569                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2570                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2571                                                                                         _mm_setzero_si128());
2572                                         fracm = _mm_srli_epi16(subtc, 1);
2573                                         pix1 = _mm_add_epi16(pix1,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576                                         pix3 = _mm_add_epi16(pix3,
2577                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2578                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2579                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2580                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2581                                         pix2 = _mm_add_epi16(pix2,
2582                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2583                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2584                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2585                                 }
2586                                 if (x <= endsub)
2587                                 {
2588                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2589                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590                                         tci = _mm_madd_epi16(tci, tcoffset);
2591                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2592                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2593                                                                                         _mm_setzero_si128());
2594                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2595                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2596                                                                                         _mm_setzero_si128());
2597                                         fracm = _mm_srli_epi16(subtc, 1);
2598                                         pix1 = _mm_add_epi16(pix1,
2599                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2600                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2601                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2602                                         pix1 = _mm_add_epi16(pix1,
2603                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2604                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2605                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2606                                         x++;
2607                                 }
2608                         }
2609                         else
2610                         {
2611                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2612                                 {
2613                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2614                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2615                                         tci = _mm_madd_epi16(tci, tcoffset);
2616                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2617                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2618                                                                                         _mm_setzero_si128());
2619                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2620                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2621                                                                                         _mm_setzero_si128());
2622                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2623                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2624                                         tci = _mm_madd_epi16(tci, tcoffset);
2625                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2626                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2627                                                                                         _mm_setzero_si128());
2628                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2629                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2630                                                                                         _mm_setzero_si128());
2631                                         fracm = _mm_srli_epi16(subtc, 1);
2632                                         pix1 = _mm_add_epi16(pix1,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635                                         pix3 = _mm_add_epi16(pix3,
2636                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2637                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2638                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2639                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2640                                         pix2 = _mm_add_epi16(pix2,
2641                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2642                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2643                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2644                                 }
2645                                 if (x <= endsub)
2646                                 {
2647                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2648                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649                                         tci = _mm_madd_epi16(tci, tcoffset);
2650                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2651                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652                                                                                         _mm_setzero_si128());
2653                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         fracm = _mm_srli_epi16(subtc, 1);
2657                                         pix1 = _mm_add_epi16(pix1,
2658                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2659                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2660                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2661                                         pix1 = _mm_add_epi16(pix1,
2662                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2663                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2664                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2665                                         x++;
2666                                 }
2667                         }
2668                 }
2669                 else
2670                 {
2671                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2672                         {
2673                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2674                                 {
2675                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2676                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2677                                         tci = _mm_madd_epi16(tci, tcoffset);
2678                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2679                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2680                                 }
2681                                 if (x <= endsub)
2682                                 {
2683                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2684                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2685                                         tci = _mm_madd_epi16(tci, tcoffset);
2686                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2687                                         x++;
2688                                 }
2689                         }
2690                         else
2691                         {
2692                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2693                                 {
2694                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2695                                         tci = _mm_and_si128(tci, tcmax); 
2696                                         tci = _mm_madd_epi16(tci, tcoffset);
2697                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2698                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2699                                 }
2700                                 if (x <= endsub)
2701                                 {
2702                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2703                                         tci = _mm_and_si128(tci, tcmax); 
2704                                         tci = _mm_madd_epi16(tci, tcoffset);
2705                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2706                                         x++;
2707                                 }
2708                         }
2709                 }
2710         }
2711 #endif
2712 }
2713
2714 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2715 {
2716         // TODO: IMPLEMENT
2717         memset(out4ub, 255, span->length*4);
2718 }
2719
// Placeholder shadow map lookup: the input vector is never read and the
// result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2725
2726 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2727 {
2728         int x;
2729         int startx = span->startx;
2730         int endx = span->endx;
2731         float c[4];
2732         float data[4];
2733         float slope[4];
2734         float z;
2735         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2736         for (x = startx;x < endx;x++)
2737         {
2738                 z = zf[x];
2739                 c[0] = (data[0] + slope[0]*x) * z;
2740                 c[1] = (data[1] + slope[1]*x) * z;
2741                 c[2] = (data[2] + slope[2]*x) * z;
2742                 c[3] = (data[3] + slope[3]*x) * z;
2743                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2744                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2745                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2746                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2747         }
2748 }
2749
2750 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2751 {
2752         int x;
2753         int startx = span->startx;
2754         int endx = span->endx;
2755         float c[4];
2756         float data[4];
2757         float slope[4];
2758         float z;
2759         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2760         for (x = startx;x < endx;x++)
2761         {
2762                 z = zf[x];
2763                 c[0] = (data[0] + slope[0]*x) * z;
2764                 c[1] = (data[1] + slope[1]*x) * z;
2765                 c[2] = (data[2] + slope[2]*x) * z;
2766                 c[3] = (data[3] + slope[3]*x) * z;
2767                 out4f[x*4+0] = c[0];
2768                 out4f[x*4+1] = c[1];
2769                 out4f[x*4+2] = c[2];
2770                 out4f[x*4+3] = c[3];
2771         }
2772 }
2773
2774 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2775 {
2776         int x, startx = span->startx, endx = span->endx;
2777         float c[4], localcolor[4];
2778         localcolor[0] = subcolor[0];
2779         localcolor[1] = subcolor[1];
2780         localcolor[2] = subcolor[2];
2781         localcolor[3] = subcolor[3];
2782         for (x = startx;x < endx;x++)
2783         {
2784                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2785                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2786                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2787                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2788                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2789                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2790                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2791                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2792         }
2793 }
2794
2795 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2796 {
2797         int x, startx = span->startx, endx = span->endx;
2798         for (x = startx;x < endx;x++)
2799         {
2800                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2801                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2802                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2803                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2804         }
2805 }
2806
2807 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2808 {
2809         int x, startx = span->startx, endx = span->endx;
2810         for (x = startx;x < endx;x++)
2811         {
2812                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2813                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2814                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2815                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2816         }
2817 }
2818
2819 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2820 {
2821         int x, startx = span->startx, endx = span->endx;
2822         float a, b;
2823         for (x = startx;x < endx;x++)
2824         {
2825                 a = 1.0f - inb4f[x*4+3];
2826                 b = inb4f[x*4+3];
2827                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2828                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2829                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2830                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2831         }
2832 }
2833
2834 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2835 {
2836         int x, startx = span->startx, endx = span->endx;
2837         float localcolor[4], ilerp, lerp;
2838         localcolor[0] = color[0];
2839         localcolor[1] = color[1];
2840         localcolor[2] = color[2];
2841         localcolor[3] = color[3];
2842         ilerp = 1.0f - localcolor[3];
2843         lerp = localcolor[3];
2844         for (x = startx;x < endx;x++)
2845         {
2846                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2847                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2848                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2849                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2850         }
2851 }
2852
2853
2854
2855 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2856 {
2857 #ifdef SSE2_PRESENT
2858         int x;
2859         int startx = span->startx;
2860         int endx = span->endx;
2861         __m128 data, slope;
2862         __m128 mod, endmod;
2863         __m128i submod, substep, endsubmod;
2864         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2865         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2866         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2867         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2868         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2869         for (x = startx; x < endx;)
2870         {
2871                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2872                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2873                 if(nextsub >= endx)
2874                 {
2875                         nextsub = endsub = endx-1;
2876                         if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2877                 }
2878                 mod = endmod;
2879                 submod = endsubmod;
2880                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2881                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2882                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2883                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2884                 substep = _mm_packs_epi32(substep, substep);
2885                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2886                 {
2887                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2888                         pix = _mm_mulhi_epu16(pix, submod);
2889                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2890                 }
2891                 if (x <= endsub)
2892                 {
2893                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2894                         pix = _mm_mulhi_epu16(pix, submod);
2895                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2896                         x++;
2897                 }
2898         }
2899 #endif
2900 }
2901
2902 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2903 {
2904 #ifdef SSE2_PRESENT
2905         int x;
2906         int startx = span->startx;
2907         int endx = span->endx;
2908         __m128 data, slope;
2909         __m128 mod, endmod;
2910         __m128i submod, substep, endsubmod;
2911         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2912         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2913         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2914         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2915         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2916         for (x = startx; x < endx;)
2917         {
2918                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2919                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2920                 if(nextsub >= endx)
2921                 {
2922                         nextsub = endsub = endx-1;
2923                         if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2924                 }
2925                 mod = endmod;
2926                 submod = endsubmod;
2927                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2928                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2929                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2930                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2931                 substep = _mm_packs_epi32(substep, substep);
2932                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2933                 {
2934                         __m128i pix = _mm_srai_epi16(submod, 4);
2935                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2936                 }
2937                 if (x <= endsub)
2938                 {
2939                         __m128i pix = _mm_srai_epi16(submod, 4);
2940                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2941                         x++;
2942                 }
2943         }
2944 #endif
2945 }
2946
2947 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2948 {
2949 #ifdef SSE2_PRESENT
2950         int x, startx = span->startx, endx = span->endx;
2951         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2952         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2953         for (x = startx;x+2 <= endx;x+=2)
2954         {
2955                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2956                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2957                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2958                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2959         }
2960         if(x < endx)
2961         {
2962                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2963                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2964                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2965                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2966         }
2967 #endif
2968 }
2969
2970 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2971 {
2972 #ifdef SSE2_PRESENT
2973         int x, startx = span->startx, endx = span->endx;
2974         for (x = startx;x+2 <= endx;x+=2)
2975         {
2976                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2977                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2978                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2979                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2980         }
2981         if(x < endx)
2982         {
2983                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2984                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2985                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2986                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2987         }
2988 #endif
2989 }
2990
2991 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2992 {
2993 #ifdef SSE2_PRESENT
2994         int x, startx = span->startx, endx = span->endx;
2995         for (x = startx;x+2 <= endx;x+=2)
2996         {
2997                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2998                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2999                 pix1 = _mm_add_epi16(pix1, pix2);
3000                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3001         }
3002         if(x < endx)
3003         {
3004                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3005                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3006                 pix1 = _mm_add_epi16(pix1, pix2);
3007                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3008         }
3009 #endif
3010 }
3011
3012 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3013 {
3014 #ifdef SSE2_PRESENT
3015         int x, startx = span->startx, endx = span->endx;
3016         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3017         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3018         for (x = startx;x+2 <= endx;x+=2)
3019         {
3020                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3023                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3024         }
3025         if(x < endx)
3026         {
3027                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3030                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3031         }
3032 #endif
3033 }
3034
3035 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3036 {
3037 #ifdef SSE2_PRESENT
3038         int x, startx = span->startx, endx = span->endx;
3039         for (x = startx;x+2 <= endx;x+=2)
3040         {
3041                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3044                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3045                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3046         }
3047         if(x < endx)
3048         {
3049                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3050                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3051                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3052                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3053                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3054         }
3055 #endif
3056 }
3057
3058 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3059 {
3060 #ifdef SSE2_PRESENT
3061         int x, startx = span->startx, endx = span->endx;
3062         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3063         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3064         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3065         for (x = startx;x+2 <= endx;x+=2)
3066         {
3067                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3068                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3069                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3070         }
3071         if(x < endx)
3072         {
3073                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3074                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3075                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3076         }
3077 #endif
3078 }
3079
3080
3081
3082 void DPSOFTRAST_VertexShader_Generic(void)
3083 {
3084         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3085         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3086         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3087         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3088                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3089 }
3090
3091 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3092 {
3093         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3094         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3095         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3096         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3097         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3098         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3099         {
3100                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3101                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3102                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3103                 {
3104                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3105                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3106                         {
3107                                 // multiply
3108                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3109                         }
3110                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3111                         {
3112                                 // add
3113                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3114                         }
3115                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3116                         {
3117                                 // alphablend
3118                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3119                         }
3120                 }
3121         }
3122         else
3123                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3124         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3125 }
3126
3127
3128
3129 void DPSOFTRAST_VertexShader_PostProcess(void)
3130 {
3131         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3132         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3133         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3134 }
3135
3136 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3137 {
3138         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3139         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3140         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3142         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3143         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3144         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3145         {
3146                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3147                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3148         }
3149         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3150         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3151         {
3152                 // TODO: implement saturation
3153         }
3154         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3155         {
3156                 // TODO: implement gammaramps
3157         }
3158         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3159 }
3160
3161
3162
3163 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3164 {
3165         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3166 }
3167
3168 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3169 {
3170         // this is never called (because colormask is off when this shader is used)
3171         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3172         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3173         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3174         memset(buffer_FragColorbgra8, 0, span->length*4);
3175         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3176 }
3177
3178
3179
3180 void DPSOFTRAST_VertexShader_FlatColor(void)
3181 {
3182         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3183         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3184 }
3185
3186 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3187 {
3188         int x, startx = span->startx, endx = span->endx;
3189         int Color_Ambienti[4];
3190         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3191         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3192         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3193         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3194         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3195         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3196         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3197         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3198         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3199         for (x = startx;x < endx;x++)
3200         {
3201                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3202                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3203                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3204                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3205         }
3206         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3207 }
3208
3209
3210
3211 void DPSOFTRAST_VertexShader_VertexColor(void)
3212 {
3213         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3214         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3215         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3216 }
3217
3218 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3219 {
3220 #ifdef SSE2_PRESENT
3221         unsigned char * RESTRICT pixelmask = span->pixelmask;
3222         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3223         int x, startx = span->startx, endx = span->endx;
3224         __m128i Color_Ambientm, Color_Diffusem;
3225         __m128 data, slope;
3226         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3230         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3232         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3233                 pixel = buffer_FragColorbgra8;
3234         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3235         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3236         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3237         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3238         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3239         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3240         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3241         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3242         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3243         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3244         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3245         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3246         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3247         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3248         {
3249                 __m128i color, mod, pix;
3250                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3251                 {
3252                         __m128i pix2, mod2;
3253                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3254                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3255                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3256                         data = _mm_add_ps(data, slope);
3257                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3258                         data = _mm_add_ps(data, slope);
3259                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3260                         data = _mm_add_ps(data, slope);
3261                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3262                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3263                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3264                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3265                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3266                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3267                         x += 3;
3268                         continue;
3269                 }
3270                 if(!pixelmask[x])
3271                         continue;
3272                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3273                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3274                 mod = _mm_packs_epi32(mod, mod);
3275                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3276                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3277         }
3278         if(pixel == buffer_FragColorbgra8)
3279                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3280 #endif
3281 }
3282
3283
3284
3285 void DPSOFTRAST_VertexShader_Lightmap(void)
3286 {
3287         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3288         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3289         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3290 }
3291
3292 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3293 {
3294 #ifdef SSE2_PRESENT
3295         unsigned char * RESTRICT pixelmask = span->pixelmask;
3296         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3297         int x, startx = span->startx, endx = span->endx;
3298         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3299         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3300         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3302         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3303         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3304         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3305         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3306         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3307         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3308                 pixel = buffer_FragColorbgra8;
3309         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3310         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3311         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3312         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3313         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3314         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3315         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3316         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3317         {
3318                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3319                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3320                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3321                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3322                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3323                 for (x = startx;x < endx;x++)
3324                 {
3325                         __m128i color, lightmap, glow, pix;
3326                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3327                         {
3328                                 __m128i pix2;
3329                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3330                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3331                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3332                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3333                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3334                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3335                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3336                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3337                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3338                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3339                                 x += 3;
3340                                 continue;
3341                         }
3342                         if(!pixelmask[x])
3343                                 continue;
3344                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3345                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3346                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3347                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3348                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3349                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3350                 }
3351         }
3352         else
3353         {
3354                 for (x = startx;x < endx;x++)
3355                 {
3356                         __m128i color, lightmap, pix;
3357                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3358                         {
3359                                 __m128i pix2;
3360                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3361                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3362                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3363                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3364                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3365                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3366                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3367                                 x += 3;
3368                                 continue;
3369                         }
3370                         if(!pixelmask[x]) 
3371                                 continue;
3372                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3373                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3374                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3375                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3376                 }
3377         }
3378         if(pixel == buffer_FragColorbgra8)
3379                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3380 #endif
3381 }
3382
3383
3384
3385 void DPSOFTRAST_VertexShader_FakeLight(void)
3386 {
3387         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3388 }
3389
3390 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3391 {
3392         // TODO: IMPLEMENT
3393         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3394         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3395         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3396         memset(buffer_FragColorbgra8, 0, span->length*4);
3397         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3398 }
3399
3400
3401
// Vertex shader for the model-space light direction map mode.
// It has no vertex processing of its own and simply reuses the lightmap
// vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3406
3407 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3408 {
3409         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3410         // TODO: IMPLEMENT
3411 }
3412
3413
3414
// Vertex shader for the tangent-space light direction map mode.
// It has no vertex processing of its own and simply reuses the lightmap
// vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3419
3420 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3421 {
3422         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3423         // TODO: IMPLEMENT
3424 }
3425
3426
3427
3428 void DPSOFTRAST_VertexShader_LightDirection(void)
3429 {
3430         int i;
3431         int numvertices = dpsoftrast.numvertices;
3432         float LightDir[4];
3433         float LightVector[4];
3434         float EyePosition[4];
3435         float EyeVectorModelSpace[4];
3436         float EyeVector[4];
3437         float position[4];
3438         float svector[4];
3439         float tvector[4];
3440         float normal[4];
3441         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3442         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3443         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3444         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3445         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3446         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3447         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3448         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3449         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3450         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3451         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3452         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3453         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3454         for (i = 0;i < numvertices;i++)
3455         {
3456                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3457                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3458                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3459                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3460                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3461                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3462                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3463                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3464                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3465                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3466                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3467                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3468                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3469                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3470                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3471                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3472                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3473                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3474                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3475                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3476                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3477                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3478                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3479                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3480                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3481                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3482                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3483                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3484                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3485         }
3486         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3487 }
3488
// small scalar and 3-component vector helpers
// NOTE: these are macros and evaluate their arguments more than once, so only
// pass expressions that have no side effects
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// scales v to unit length in place, a zero-length vector is left untouched
// the argument is parenthesized so that expressions such as (array + offset)
// work, and the temporary has a prefixed name so that it can not capture a
// caller variable called len
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_v3n_len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_v3n_len)\
	{\
		dpsoftrast_v3n_len = 1.0f / dpsoftrast_v3n_len;\
		(v)[0] *= dpsoftrast_v3n_len;\
		(v)[1] *= dpsoftrast_v3n_len;\
		(v)[2] *= dpsoftrast_v3n_len;\
	}\
}\
while(0)
3507
3508 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3509 {
3510         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3511         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3514         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3515         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3516         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3517         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3518         int x, startx = span->startx, endx = span->endx;
3519         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3520         float LightVectordata[4];
3521         float LightVectorslope[4];
3522         float EyeVectordata[4];
3523         float EyeVectorslope[4];
3524         float z;
3525         float diffusetex[4];
3526         float glosstex[4];
3527         float surfacenormal[4];
3528         float lightnormal[4];
3529         float eyenormal[4];
3530         float specularnormal[4];
3531         float diffuse;
3532         float specular;
3533         float SpecularPower;
3534         int d[4];
3535         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3536         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3537         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3538         Color_Glow[3] = 0.0f;
3539         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3540         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3541         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3542         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3543         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3544         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3545         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3546         Color_Pants[3] = 0.0f;
3547         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3548         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3549         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3550         Color_Shirt[3] = 0.0f;
3551         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3552         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3553         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3554         {
3555                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3556                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3557         }
3558         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3559         {
3560                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3561         }
3562         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3563         {
3564                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3565                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3566                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3567                 Color_Diffuse[3] = 0.0f;
3568                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3569                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3570                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3571                 LightColor[3] = 0.0f;
3572                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3573                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3574                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3575                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3576                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3577                 Color_Specular[3] = 0.0f;
3578                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3579                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3580                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3581                 for (x = startx;x < endx;x++)
3582                 {
3583                         z = buffer_z[x];
3584                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3585                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3586                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3587                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3588                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3589                         {
3590                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3591                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3592                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3593                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3594                         }
3595                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3596                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3597                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3598                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3599                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3600                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3601                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3602                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3603
3604                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3605                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3606                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3607                         DPSOFTRAST_Vector3Normalize(lightnormal);
3608
3609                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3610                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3611                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3612                         DPSOFTRAST_Vector3Normalize(eyenormal);
3613
3614                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3615                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3616                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3617                         DPSOFTRAST_Vector3Normalize(specularnormal);
3618
3619                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3620                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3621                         specular = pow(specular, SpecularPower * glosstex[3]);
3622                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3623                         {
3624                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3625                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3626                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3627                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3628                         }
3629                         else
3630                         {
3631                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3632                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3633                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3634                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3635                         }
3636                         buffer_FragColorbgra8[x*4+0] = d[0];
3637                         buffer_FragColorbgra8[x*4+1] = d[1];
3638                         buffer_FragColorbgra8[x*4+2] = d[2];
3639                         buffer_FragColorbgra8[x*4+3] = d[3];
3640                 }
3641         }
3642         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3643         {
3644                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3645                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3646                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3647                 Color_Diffuse[3] = 0.0f;
3648                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3649                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3650                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3651                 LightColor[3] = 0.0f;
3652                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3653                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3654                 for (x = startx;x < endx;x++)
3655                 {
3656                         z = buffer_z[x];
3657                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3658                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3659                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3660                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3661                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3662                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3663                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3664                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3665
3666                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3667                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3668                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3669                         DPSOFTRAST_Vector3Normalize(lightnormal);
3670
3671                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3672                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3673                         {
3674                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3675                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3676                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3677                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3678                         }
3679                         else
3680                         {
3681                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3682                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3683                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3684                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3685                         }
3686                         buffer_FragColorbgra8[x*4+0] = d[0];
3687                         buffer_FragColorbgra8[x*4+1] = d[1];
3688                         buffer_FragColorbgra8[x*4+2] = d[2];
3689                         buffer_FragColorbgra8[x*4+3] = d[3];
3690                 }
3691         }
3692         else
3693         {
3694                 for (x = startx;x < endx;x++)
3695                 {
3696                         z = buffer_z[x];
3697                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3698                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3699                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3700                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3701
3702                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3703                         {
3704                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3705                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3706                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3707                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3708                         }
3709                         else
3710                         {
3711                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3712                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3713                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3714                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3715                         }
3716                         buffer_FragColorbgra8[x*4+0] = d[0];
3717                         buffer_FragColorbgra8[x*4+1] = d[1];
3718                         buffer_FragColorbgra8[x*4+2] = d[2];
3719                         buffer_FragColorbgra8[x*4+3] = d[3];
3720                 }
3721         }
3722         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3723 }
3724
3725
3726
3727 void DPSOFTRAST_VertexShader_LightSource(void)
3728 {
3729         int i;
3730         int numvertices = dpsoftrast.numvertices;
3731         float LightPosition[4];
3732         float LightVector[4];
3733         float LightVectorModelSpace[4];
3734         float EyePosition[4];
3735         float EyeVectorModelSpace[4];
3736         float EyeVector[4];
3737         float position[4];
3738         float svector[4];
3739         float tvector[4];
3740         float normal[4];
3741         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3742         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3743         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3744         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3745         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3746         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3747         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3748         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3749         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3750         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3751         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3752         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3753         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3754         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3755         for (i = 0;i < numvertices;i++)
3756         {
3757                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3758                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3759                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3760                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3761                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3762                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3763                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3764                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3765                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3766                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3767                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3768                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3769                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3770                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3771                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3772                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3773                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3774                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3775                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3776                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3777                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3778                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3779                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3780                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3781                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3782                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3783                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3784                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3785                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3786                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3787                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3788                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3789         }
3790         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3791         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3792 }
3793
// Span (pixel) shader for the light-source shader mode.
// For pixels startx..endx-1 of the span it builds a lit BGRA8 color from the
// color texture (plus optional pants/shirt, cube filter, normal and gloss
// textures, selected by bits of thread->shader_permutation), scales it by an
// attenuation term and optionally a shadowmap sample, and passes the result
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; uniform4f[] and shader_permutation are read from it
//   triangle - triangle being rasterized (forwarded to the attribute/texture helpers)
//   span     - span being shaded; span->startx / span->endx bound the pixels processed
// The whole body is compiled only when SSE2_PRESENT is defined; without it the
// function is empty.
3794 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3795 {
3796 #ifdef SSE2_PRESENT
3797         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3798         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3799         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3800         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3801         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3802         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3803         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3804         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3805         int x, startx = span->startx, endx = span->endx;
3806         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3807         float CubeVectordata[4];
3808         float CubeVectorslope[4];
3809         float LightVectordata[4];
3810         float LightVectorslope[4];
3811         float EyeVectordata[4];
3812         float EyeVectorslope[4];
3813         float z;
3814         float diffusetex[4];
3815         float glosstex[4];
3816         float surfacenormal[4];
3817         float lightnormal[4];
3818         float eyenormal[4];
3819         float specularnormal[4];
3820         float diffuse;
3821         float specular;
3822         float SpecularPower;
3823         float CubeVector[4];
3824         float attenuation;
3825         int d[4];
             // Load the color uniforms with components 0 and 2 swapped (uniform [0] -> index 2,
             // uniform [2] -> index 0), presumably so the index order lines up with the
             // byte order of the *bgra8 texture buffers used below.
             // NOTE(review): Color_Glow and Color_Ambient[3] are assigned here but never
             // read anywhere else in this function.
3826         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3827         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3828         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3829         Color_Glow[3] = 0.0f;
3830         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3831         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3832         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3833         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3834         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3835         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3836         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3837         Color_Diffuse[3] = 0.0f;
3838         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3839         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3840         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3841         Color_Specular[3] = 0.0f;
3842         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3843         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3844         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3845         Color_Pants[3] = 0.0f;
3846         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3847         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3848         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3849         Color_Shirt[3] = 0.0f;
3850         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3851         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3852         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3853         LightColor[3] = 0.0f;
             // pre-scaled by 1/255: in the specular path it is multiplied by glosstex[3],
             // which holds the raw gloss alpha byte
3854         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
             // data/slope pairs for three vertex attributes; the loops below evaluate them
             // per pixel as (data + slope*x) * buffer_z[x]
3855         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3856         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3857         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3858         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3859         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
             // texture fetches shared by all three lighting paths
3860         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3861         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3862         {
3863                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3864                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3865         }
3866         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3867                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
             // Three mutually exclusive lighting paths follow:
             //   SPECULAR : ambient + diffuse + specular
             //   DIFFUSE  : ambient + diffuse
             //   otherwise: ambient only
             // Each path repeats the same attenuation / shadowmap / colormapping steps.
3868         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3869         {
3870                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3871                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3872                 for (x = startx;x < endx;x++)
3873                 {
3874                         z = buffer_z[x];
3875                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3876                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3877                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
                             // attenuation is 1 minus the squared length of CubeVector; pixels
                             // below 0.01 are skipped and keep the zero written by the memset above
3878                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3879                         if (attenuation < 0.01f)
3880                                 continue;
3881                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3882                         {
3883                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3884                                 if (attenuation < 0.01f)
3885                                         continue;
3886                         }
3887
3888                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3889                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3890                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3891                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3892                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3893                         {
                                     // add the pants/shirt layers tinted by their colors; the [3]
                                     // line adds nothing because Color_Pants[3] and Color_Shirt[3] are 0
3894                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3895                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3896                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3897                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3898                         }
3899                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3900                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3901                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3902                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
                             // expand normal-map bytes as byte/128 - 1; byte 2 feeds component 0
                             // and byte 0 feeds component 2
3903                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3904                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3905                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3906                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3907
3908                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3909                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3910                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3911                         DPSOFTRAST_Vector3Normalize(lightnormal);
3912
3913                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3914                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3915                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3916                         DPSOFTRAST_Vector3Normalize(eyenormal);
3917
                             // normalized sum of the light and eye directions
3918                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3919                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3920                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3921                         DPSOFTRAST_Vector3Normalize(specularnormal);
3922
                             // both dot products are clamped at zero; the specular exponent is
                             // SpecularPower times the gloss alpha byte
3923                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3924                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3925                         specular = pow(specular, SpecularPower * glosstex[3]);
                             // Results are clamped to 255 only; there is no lower clamp.
                             // d[3] is the diffuse alpha passed through unlit.
3926                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3927                         {
3928                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3929                                 attenuation *= (1.0f / 255.0f);
3930                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3931                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3932                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3933                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3934                         }
3935                         else
3936                         {
3937                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3938                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3939                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3940                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3941                         }
3942                         buffer_FragColorbgra8[x*4+0] = d[0];
3943                         buffer_FragColorbgra8[x*4+1] = d[1];
3944                         buffer_FragColorbgra8[x*4+2] = d[2];
3945                         buffer_FragColorbgra8[x*4+3] = d[3];
3946                 }
3947         }
3948         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3949         {
                     // diffuse path: same as the specular path minus the gloss texture,
                     // eye vector and specular term
3950                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3951                 for (x = startx;x < endx;x++)
3952                 {
3953                         z = buffer_z[x];
3954                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3955                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3956                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3957                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3958                         if (attenuation < 0.01f)
3959                                 continue;
3960                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3961                         {
3962                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3963                                 if (attenuation < 0.01f)
3964                                         continue;
3965                         }
3966
3967                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3968                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3969                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3970                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3971                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3972                         {
3973                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3974                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3975                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3976                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3977                         }
3978                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3979                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3980                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3981                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3982
3983                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3984                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3985                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3986                         DPSOFTRAST_Vector3Normalize(lightnormal);
3987
3988                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3989                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3990                         {
3991                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3992                                 attenuation *= (1.0f / 255.0f);
3993                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3994                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3995                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3996                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3997                         }
3998                         else
3999                         {
4000                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4001                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4002                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4003                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4004                         }
4005                         buffer_FragColorbgra8[x*4+0] = d[0];
4006                         buffer_FragColorbgra8[x*4+1] = d[1];
4007                         buffer_FragColorbgra8[x*4+2] = d[2];
4008                         buffer_FragColorbgra8[x*4+3] = d[3];
4009                 }
4010         }
4011         else
4012         {
                     // ambient-only path: no normal map lookup, no light or eye vectors
4013                 for (x = startx;x < endx;x++)
4014                 {
4015                         z = buffer_z[x];
4016                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4017                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4018                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4019                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4020                         if (attenuation < 0.01f)
4021                                 continue;
4022                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4023                         {
4024                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4025                                 if (attenuation < 0.01f)
4026                                         continue;
4027                         }
4028
4029                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4030                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4031                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4032                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4033                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4034                         {
4035                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4036                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4037                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4038                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4039                         }
4040                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4041                         {
4042                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4043                                 attenuation *= (1.0f / 255.0f);
4044                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4045                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4046                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4047                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4048                         }
4049                         else
4050                         {
4051                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4052                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4053                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4054                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4055                         }
4056                         buffer_FragColorbgra8[x*4+0] = d[0];
4057                         buffer_FragColorbgra8[x*4+1] = d[1];
4058                         buffer_FragColorbgra8[x*4+2] = d[2];
4059                         buffer_FragColorbgra8[x*4+3] = d[3];
4060                 }
4061         }
4062         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4063 #endif
4064 }
4065
4066
4067
4068 void DPSOFTRAST_VertexShader_Refraction(void)
4069 {
4070         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4071 }
4072
4073 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4074 {
4075         // TODO: IMPLEMENT
4076         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4077         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4078         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4079         memset(buffer_FragColorbgra8, 0, span->length*4);
4080         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4081 }
4082
4083
4084
4085 void DPSOFTRAST_VertexShader_Water(void)
4086 {
4087         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4088 }
4089
4090
4091 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4092 {
4093         // TODO: IMPLEMENT
4094         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4095         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4096         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4097         memset(buffer_FragColorbgra8, 0, span->length*4);
4098         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4099 }
4100
4101
4102
4103 void DPSOFTRAST_VertexShader_ShowDepth(void)
4104 {
4105         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4106 }
4107
4108 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4109 {
4110         // TODO: IMPLEMENT
4111         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4112         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4113         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4114         memset(buffer_FragColorbgra8, 0, span->length*4);
4115         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4116 }
4117
4118
4119
4120 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4121 {
4122         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4123 }
4124
4125 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4126 {
4127         // TODO: IMPLEMENT
4128         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4129         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4130         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4131         memset(buffer_FragColorbgra8, 0, span->length*4);
4132         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4133 }
4134
4135
4136
4137 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4138 {
4139         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4140 }
4141
4142 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4143 {
4144         // TODO: IMPLEMENT
4145         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4146         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4148         memset(buffer_FragColorbgra8, 0, span->length*4);
4149         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4150 }
4151
4152
4153
4154 typedef struct DPSOFTRAST_ShaderModeInfo_s
4155 {
4156         int lodarrayindex;
4157         void (*Vertex)(void);
4158         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4159         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4160         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4161 }
4162 DPSOFTRAST_ShaderModeInfo;
4163
4164 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4165 {
4166         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4167         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4168         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4169         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4170         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4171         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4172         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4173         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4174         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4175         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4176         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4177         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4178         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4179         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4180         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4181         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4182 };
4183
4184 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4185 {
4186         int i;
4187         int x;
4188         int startx;
4189         int endx;
4190 //      unsigned int c;
4191 //      unsigned int *colorpixel;
4192         unsigned int *depthpixel;
4193         float w;
4194         float wslope;
4195         int depth;
4196         int depthslope;
4197         unsigned int d;
4198         DPSOFTRAST_State_Triangle *triangle;
4199         DPSOFTRAST_State_Span *span;
4200         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4201         for (i = 0; i < thread->numspans; i++)
4202         {
4203                 span = &thread->spans[i];
4204                 triangle = &thread->triangles[span->triangle];
4205                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4206                 {
4207                         wslope = triangle->w[0];
4208                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4209                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4210                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4211                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4212                         switch(thread->fb_depthfunc)
4213                         {
4214                         default:
4215                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4216                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4217                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4218                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4219                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4220                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4221                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4222                         }
4223                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4224                         //for (x = 0;x < span->length;x++)
4225                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4226                         // if there is no color buffer, skip pixel shader
4227                         startx = 0;
4228                         endx = span->length;
4229                         while (startx < endx && !pixelmask[startx])
4230                                 startx++;
4231                         while (endx > startx && !pixelmask[endx-1])
4232                                 endx--;
4233                         if (startx >= endx)
4234                                 continue; // no pixels to fill
4235                         span->pixelmask = pixelmask;
4236                         span->startx = startx;
4237                         span->endx = endx;
4238                         // run pixel shader if appropriate
4239                         // do this before running depthmask code, to allow the pixelshader
4240                         // to clear pixelmask values for alpha testing
4241                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4242                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4243                         if (thread->depthmask)
4244                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4245                                         if (pixelmask[x])
4246                                                 depthpixel[x] = d;
4247                 }
4248                 else
4249                 {
4250                         // no depth testing means we're just dealing with color...
4251                         // if there is no color buffer, skip pixel shader
4252                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4253                         {
4254                                 memset(pixelmask, 1, span->length);
4255                                 span->pixelmask = pixelmask;
4256                                 span->startx = 0;
4257                                 span->endx = span->length;
4258                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4259                         }
4260                 }
4261         }
4262         thread->numspans = 0;
4263 }
4264
// Draw command payload (opcode 22).  The visible code refers to the result as
// DPSOFTRAST_Command_Draw / DPSOFTRAST_OPCODE_Draw and also reads a commandsize
// member that is not listed here.
// NOTE(review): DEFCOMMAND is defined outside this part of the file; presumably
// it prepends the common command header (including commandsize) - confirm.
//   datasize      - not read or written by the code visible in this part of the file
//   starty, endy  - row range of the draw, filled in by DPSOFTRAST_DrawTriangles;
//                   DPSOFTRAST_Interpret_Draw compares it against the thread's row band
//   refcount      - set to dpsoftrast.numthreads by DPSOFTRAST_DrawTriangles and
//                   decremented once per DPSOFTRAST_Interpret_Draw call; the call that
//                   brings it to zero releases a separately allocated arrays block
//   clipped       - when zero, DPSOFTRAST_Interpret_Draw skips the near plane test
//   firstvertex   - subtracted from every element index
//   numvertices   - vertices per float[4] block inside arrays
//   numtriangles  - number of triangles to rasterize
//   arrays        - vertex data: screen coordinates block, then the position block,
//                   then one block per array listed by the shader mode
//   element3i/3s  - private copy of the index list (at most one is non-NULL)
4265 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4266
// Executes one Draw command on behalf of a single thread.
//
// Each thread owns a horizontal band of framebuffer rows [miny, maxy).  For every
// triangle of the command this function:
//   1. fetches the three vertex indices,
//   2. optionally clips against the near plane (producing 3 or 4 screen points),
//   3. rejects the triangle by facing (SKIPBACKFACE) or if its bounding box misses the band,
//   4. stores the screen-space gradients of w and of every shader input in
//      thread->triangles[] and picks a mip level per texture unit,
//   5. walks the edges row by row and appends spans to thread->spans[],
//      flushing through DPSOFTRAST_Draw_ProcessSpans whenever a buffer fills up.
// Finally the command's refcount is dropped; the call that brings it to zero frees
// command->arrays if it was allocated separately from the command.
//
// The whole body is compiled only when SSE2_PRESENT is defined; without it the
// function does nothing.
4267 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4268 {
4269 #ifdef SSE2_PRESENT
4270         int cullface = thread->cullface;
4271         int width = dpsoftrast.fb_width;
             // rows [miny, maxy) are the slice of the framebuffer handled by this thread
4272         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4273         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4274         __m128i fbmin, fbmax;
4275         __m128 viewportcenter, viewportscale;
4276         int firstvertex = command->firstvertex;
4277         int numvertices = command->numvertices;
4278         int numtriangles = command->numtriangles;
4279         const int *element3i = command->element3i;
4280         const unsigned short *element3s = command->element3s;
4281         int clipped = command->clipped;
4282         int i;
4283         int j;
4284         int k;
4285         int y;
4286         int e[3];
4287         __m128i screeny;
4288         int starty, endy;
4289         int numpoints;
4290         int clipcase;
4291         float clipdist[4];
4292         __m128 triangleedge1, triangleedge2, trianglenormal;
4293         __m128 clipfrac[3];
4294         __m128 screen[4];
4295         DPSOFTRAST_State_Triangle *triangle;
4296         DPSOFTRAST_Texture *texture;
             // the draw's row range does not touch this thread's band: just drop our
             // reference and leave
4297         if (command->starty >= maxy || command->endy <= miny)
4298         {
4299                 if (!ATOMIC_DECREMENT(command->refcount))
4300                 {
                             // a command no larger than the bare struct is the case where
                             // DPSOFTRAST_Draw_AllocateDrawCommand obtained the vertex data
                             // from MM_CALLOC instead of placing it behind the command
4301                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4302                                 MM_FREE(command->arrays);
4303                 }
4304                 return;
4305         }
4306         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp rectangle as packed 16 bit (x, y) pairs: (0, miny) .. (width-1, maxy-1)
4307         fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4308         fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4309         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4310         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4311         screen[3] = _mm_setzero_ps();
4312         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4313         for (i = 0;i < numtriangles;i++)
4314         {
                     // command->arrays starts with the screen coordinates; the block after
                     // it is the one DPSOFTRAST_Draw_AllocateDrawCommand registers as
                     // post_array4f[DPSOFTRAST_ARRAY_POSITION]
4315                 const float *screencoord4f = command->arrays;
4316                 const float *arrays = screencoord4f + numvertices*4;
4317
4318                 // generate the 3 edges of this triangle
4319                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // vertex indices, made relative to the first vertex of this draw
4320                 if (element3s)
4321                 {
4322                         e[0] = element3s[i*3+0] - firstvertex;
4323                         e[1] = element3s[i*3+1] - firstvertex;
4324                         e[2] = element3s[i*3+2] - firstvertex;
4325                 }
4326                 else if (element3i)
4327                 {
4328                         e[0] = element3i[i*3+0] - firstvertex;
4329                         e[1] = element3i[i*3+1] - firstvertex;
4330                         e[2] = element3i[i*3+2] - firstvertex;
4331                 }
4332                 else
4333                 {
4334                         e[0] = i*3+0;
4335                         e[1] = i*3+1;
4336                         e[2] = i*3+2;
4337                 }
4338
                     // SKIPBACKFACE: builds the two screen-space edges that meet at
                     // screen[1] and the scalar edge1.x*edge2.y - edge1.y*edge2.x, then
                     // skips to the next triangle (continue) when its sign fails the
                     // cullface test.  The edges and the scalar are reused further down
                     // for the gradient setup.
4339 #define SKIPBACKFACE \
4340                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4341                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4342                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4343                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4344                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4345                 switch(cullface) \
4346                 { \
4347                 case GL_BACK: \
4348                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4349                                 continue; \
4350                         break; \
4351                 case GL_FRONT: \
4352                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4353                                 continue; \
4354                         break; \
4355                 }
4356
                     // CLIPPEDVERTEXLERP: screen[k] = projection of the point where edge
                     // p1->p2 crosses the near plane; the interpolation fraction is kept in
                     // clipfrac[p1] so GENATTRIBLERP can reuse it for the other arrays.
                     // CLIPPEDVERTEXCOPY: screen[k] = already projected vertex p1.
4357 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4358                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4359                         { \
4360                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4361                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4362                         }
4363 #define CLIPPEDVERTEXCOPY(k,p1) \
4364                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4365
                     // GENATTRIBS: fetches the values of the current array at the first
                     // three output points of the (possibly clipped) polygon, copying or
                     // interpolating exactly as the matching clipcase did for the positions.
4366 #define GENATTRIBCOPY(attrib, p1) \
4367                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4368 #define GENATTRIBLERP(attrib, p1, p2) \
4369                 { \
4370                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4371                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4372                 }
4373 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4374                 switch(clipcase) \
4375                 { \
4376                 default: \
4377                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4378                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4379                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4380                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4381                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4382                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4383                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4384                 }
4385
                     // draws flagged as unclipped jump straight into the "all three
                     // vertices visible" branch below and never read clipdist
4386                 if (! clipped)
4387                         goto notclipped;
4388
4389                 // calculate distance from nearplane
4390                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4391                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4392                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // one branch per combination of vertices in front of the plane; each
                     // fills screen[0..numpoints-1] and records its clipcase for GENATTRIBS
4393                 if (clipdist[0] >= 0.0f)
4394                 {
4395                         if (clipdist[1] >= 0.0f)
4396                         {
4397                                 if (clipdist[2] >= 0.0f)
4398                                 {
4399                                 notclipped:
4400                                         // triangle is entirely in front of nearplane
4401                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4402                                         SKIPBACKFACE;
4403                                         numpoints = 3;
4404                                         clipcase = 0;
4405                                 }
4406                                 else
4407                                 {
4408                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4409                                         SKIPBACKFACE;
4410                                         numpoints = 4;
4411                                         clipcase = 1;
4412                                 }
4413                         }
4414                         else
4415                         {
4416                                 if (clipdist[2] >= 0.0f)
4417                                 {
4418                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4419                                         SKIPBACKFACE;
4420                                         numpoints = 4;
4421                                         clipcase = 2;
4422                                 }
4423                                 else
4424                                 {
4425                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4426                                         SKIPBACKFACE;
4427                                         numpoints = 3;
4428                                         clipcase = 3;
4429                                 }
4430                         }
4431                 }
4432                 else if (clipdist[1] >= 0.0f)
4433                 {
4434                         if (clipdist[2] >= 0.0f)
4435                         {
4436                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4437                                 SKIPBACKFACE;
4438                                 numpoints = 4;
4439                                 clipcase = 4;
4440                         }
4441                         else
4442                         {
4443                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4444                                 SKIPBACKFACE;
4445                                 numpoints = 3;
4446                                 clipcase = 5;
4447                         }
4448                 }
4449                 else if (clipdist[2] >= 0.0f)
4450                 {
4451                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4452                         SKIPBACKFACE;
4453                         numpoints = 3;
4454                         clipcase = 6;
4455                 }
4456                 else continue; // triangle is entirely behind nearplane
4457
4458                 {
4459                         // calculate integer y coords for triangle points
                             // screeni holds the truncated x,y of the points as eight 16 bit
                             // values (x0,y0,x1,y1,x2,y2,x3,y3); for a 3 point polygon point 2
                             // is repeated in the last slot.  The min/max shuffles reduce this
                             // to the bounding box in the two lowest 16 bit lanes (x, then y).
4460                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4461                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4462                                         screenmin = _mm_min_epi16(screeni, screenir),
4463                                         screenmax = _mm_max_epi16(screeni, screenir);
4464                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4465                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
                             // clamp the box to this thread's band of the framebuffer
4466                         screenmin = _mm_max_epi16(screenmin, fbmin);
4467                         screenmax = _mm_min_epi16(screenmax, fbmax);
4468                         // skip offscreen triangles
4469                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4470                                 continue;
                             // rows [starty, endy) are the ones to scan for this triangle
4471                         starty = _mm_extract_epi16(screenmin, 1);
4472                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // one 32 bit lane per point, holding that point's integer y
4473                         screeny = _mm_srai_epi32(screeni, 16);
4474                 }
4475
4476                 triangle = &thread->triangles[thread->numtriangles];
4477
4478                 // calculate attribute plans for triangle data...
4479                 // okay, this triangle is going to produce spans, we'd better project
4480                 // the interpolants now (this is what gives perspective texturing),
4481                 // this consists of simply multiplying all arrays by the W coord
4482                 // (which is basically 1/Z), which will be undone per-pixel
4483                 // (multiplying by Z again) to get the perspective-correct array
4484                 // values
4485                 {
4486                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4487                         __m128 mipedgescale, mipdensity;
                             // (edge1.x, edge1.y, edge2.x, edge2.y) divided by the scalar from
                             // SKIPBACKFACE, then broadcast one component per register
4488                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4489                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4490                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4491                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4492                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4493                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4494                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4495                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // gradient of w itself: change per pixel in x, change per pixel in
                             // y, and the value extrapolated to pixel (0, 0); the span code
                             // evaluates it as w[2] + x*w[0] + y*w[1]
4496                         attribedge1 = _mm_sub_ss(w0, w1);
4497                         attribedge2 = _mm_sub_ss(w2, w1);
4498                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4499                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4500                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4501                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4502                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4503                         _mm_store_ss(&triangle->w[0], attribxslope);
4504                         _mm_store_ss(&triangle->w[1], attribyslope);
4505                         _mm_store_ss(&triangle->w[2], attriborigin);
4506                         mipedgescale = _mm_setzero_ps();
                             // same gradient setup for every array the shader mode lists (the
                             // list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL); the
                             // values are multiplied by w first, see the comment above
4507                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4508                         {
4509                                 __m128 attrib0, attrib1, attrib2;
4510                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4511                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4512                                         break;
                                     // step to the next per-vertex block; the blocks were laid
                                     // out in this same list order by
                                     // DPSOFTRAST_Draw_AllocateDrawCommand
4513                                 arrays += numvertices*4;
4514                                 GENATTRIBS(attrib0, attrib1, attrib2);
4515                                 attriborigin = _mm_mul_ps(attrib1, w1);
4516                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4517                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4518                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4519                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4520                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4521                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4522                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4523                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
4524                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4525                                 {
                                             // for the array that drives mip selection: difference of
                                             // its first two components along each of the two edges,
                                             // scaled by the reciprocal screen length of that edge
4526                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4527                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4528                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4529                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4530                                 }
4531                         }
4532
                             // every texture unit starts at mip 0; only units whose texture
                             // filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a level
                             // computed (presumably the mipmapped filter modes - confirm)
4533                         memset(triangle->mip, 0, sizeof(triangle->mip));
4534                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4535                         {
4536                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4537                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4538                                         break;
4539                                 texture = thread->texbound[texunit];
4540                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4541                                 {
                                             // scale by the two ints at mipmap[0][2..3] (presumably the
                                             // level 0 width and height - confirm), square and add the
                                             // component pairs, then keep the smaller of the two edges
4542                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4543                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4544                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4545                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4546                                         // this will be multiplied in the texturing routine by the texture resolution
4547                                         y = _mm_cvtss_si32(mipdensity);
4548                                         if (y > 0)
4549                                         {
                                                     // y is a squared density, so half of its base 2
                                                     // logarithm is the mip level; limit it to the last
                                                     // level the texture has
4550                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4551                                                 if (y > texture->mipmaps - 1)
4552                                                         y = texture->mipmaps - 1;
4553                                                 triangle->mip[texunit] = y;
4554                                         }
4555                                 }
4556                         }
4557                 }
4558
                     // scan conversion: each pass of this loop handles the run of rows up
                     // to the next point of the polygon, using one pair of edges
4559                 for (y = starty; y < endy;)
4560                 {
4561                         __m128 xcoords, xslope;
                             // per point: all ones when y is greater than the point's integer
                             // y (the point lies above the current row), else zero;
                             // yccmask carries four mask bits per point
4562                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4563                         int yccmask = _mm_movemask_epi8(ycc);
                             // each edge runs from a point above the row (p) to a point that
                             // is not (n)
4564                         int edge0p, edge0n, edge1p, edge1n;
4565                         int nexty;
4566                         if (numpoints == 4)
4567                         {
4568                                 switch(yccmask)
4569                                 {
4570                                 default:
4571                                 case 0xFFFF: /*0000*/ y = endy; continue;
4572                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4573                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4574                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4575                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4576                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4577                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4578                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4579                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4580                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4581                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4582                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4583                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4584                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4585                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4586                                 case 0x0000: /*1111*/ y++; continue;
4587                                 }
4588                         }
4589                         else
4590                         {
                                     // with 3 points the fourth slot repeats point 2, so the two
                                     // upper mask digits are always equal
4591                                 switch(yccmask)
4592                                 {
4593                                 default:
4594                                 case 0xFFFF: /*000*/ y = endy; continue;
4595                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4596                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4597                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4598                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4599                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4600                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4601                                 case 0x0000: /*111*/ y++; continue;
4602                                 }
4603                         }
                             // nexty = smallest integer y among the points not yet above the
                             // row (points already above are replaced by 0x7FFF before the
                             // minimum is taken), limited to the last row of the box
4604                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4605                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4606                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4607                         nexty = _mm_extract_epi16(ycc, 0);
4608                         if(nexty >= endy) nexty = endy-1;
                             // make edge0 the left one: swap when edge0's largest x exceeds
                             // edge1's smallest x
4609                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4610                         {
4611                                 int tmp = edge0n;
4612                                 edge0n = edge1n;
4613                                 edge1n = tmp;
4614                                 tmp = edge0p;
4615                                 edge0p = edge1p;
4616                                 edge1p = tmp;
4617                         }
                             // xslope lanes 0 and 2: dx/dy of edge0 and edge1;
                             // xcoords lanes 0 and 2: x of each edge at row y, plus 0.5
4618                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4619                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4620                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4621                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4622                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4623                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4624                         {
4625                                 int startx, endx, offset;
4626                                 startx = _mm_cvtss_si32(xcoords);
4627                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4628                                 if (startx < 0) startx = 0;
4629                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4630                                 if (startx >= endx) continue;
                                     // cut the row into spans of at most
                                     // DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4631                                 for (offset = startx; offset < endx;)
4632                                 {
4633                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4634                                         span->triangle = thread->numtriangles;
4635                                         span->x = offset;
4636                                         span->y = y;
4637                                         span->length = endx - offset;
4638                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4639                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4640                                         offset += span->length;
                                             // span buffer full: draw what has been queued so far
4641                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4642                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4643                                 }
4644                         }
4645                 }
4646
                     // triangle buffer full: draw the queued spans, then reuse the
                     // triangle slots from the start
4647                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4648                 {
4649                         DPSOFTRAST_Draw_ProcessSpans(thread);
4650                         thread->numtriangles = 0;
4651                 }
4652         }
4653
             // the vertex data is not read past this point, so give up our reference; the
             // last thread to do so frees a separately allocated data block
4654         if (!ATOMIC_DECREMENT(command->refcount))
4655         {
4656                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4657                         MM_FREE(command->arrays);
4658         }
4659
             // draw whatever is still queued
4660         if (thread->numspans > 0 || thread->numtriangles > 0)
4661         {
4662                 DPSOFTRAST_Draw_ProcessSpans(thread);
4663                 thread->numtriangles = 0;
4664         }
4665 #endif
4666 }
4667
4668 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4669 {
4670         int i;
4671         int j;
4672         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4673         int datasize = 2*numvertices*sizeof(float[4]);
4674         DPSOFTRAST_Command_Draw *command;
4675         unsigned char *data;
4676         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4677         {
4678                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4679                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4680                         break;
4681                 datasize += numvertices*sizeof(float[4]);
4682         }
4683         if (element3s)
4684                 datasize += numtriangles*sizeof(unsigned short[3]);
4685         else if (element3i)
4686                 datasize += numtriangles*sizeof(int[3]);
4687         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4688         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4689         {
4690                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4691                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4692         }
4693         else
4694         {
4695                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4696                 data = (unsigned char *)command + commandsize;
4697         }
4698         command->firstvertex = firstvertex;
4699         command->numvertices = numvertices;
4700         command->numtriangles = numtriangles;
4701         command->arrays = (float *)data;
4702         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4703         dpsoftrast.firstvertex = firstvertex;
4704         dpsoftrast.numvertices = numvertices;
4705         dpsoftrast.screencoord4f = (float *)data;
4706         data += numvertices*sizeof(float[4]);
4707         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4708         data += numvertices*sizeof(float[4]);
4709         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4710         {
4711                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4712                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4713                         break;
4714                 dpsoftrast.post_array4f[j] = (float *)data;
4715                 data += numvertices*sizeof(float[4]);
4716         }
4717         command->element3i = NULL;
4718         command->element3s = NULL;
4719         if (element3s)
4720         {
4721                 command->element3s = (unsigned short *)data;
4722                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4723         }
4724         else if (element3i)
4725         {
4726                 command->element3i = (int *)data;
4727                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4728         }
4729         return command;
4730 }
4731
4732 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4733 {
4734         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4735         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4736         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4737         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4738         if (command->starty >= command->endy)
4739         {
4740                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4741                         MM_FREE(command->arrays);
4742                 DPSOFTRAST_UndoCommand(command->commandsize);
4743                 return;
4744         }
4745         command->clipped = dpsoftrast.drawclipped;
4746         command->refcount = dpsoftrast.numthreads;
4747
4748 #ifdef USE_THREADS
4749         DPSOFTRAST_Draw_SyncCommands();
4750         {
4751                 int i;
4752                 int nexty = 0;
4753                 for (i = 0; i < dpsoftrast.numthreads; i++)
4754                 {
4755                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4756                         int y = nexty;
4757                         nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4758                         if (command->starty < nexty && command->endy > y && thread->starving)
4759                                 SDL_CondSignal(thread->drawcond);
4760                 }
4761         }
4762 #else
4763         DPSOFTRAST_Draw_FlushThreads();
4764 #endif
4765 }
4766  
4767 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4768 {
4769         int commandoffset = thread->commandoffset;
4770         while (commandoffset != endoffset)
4771         {
4772                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4773                 switch (command->opcode)
4774                 {
4775 #define INTERPCOMMAND(name) \
4776                 case DPSOFTRAST_OPCODE_##name : \
4777                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4778                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4779                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4780                                 commandoffset = 0; \
4781                         break;
4782                 INTERPCOMMAND(Viewport)
4783                 INTERPCOMMAND(ClearColor)
4784                 INTERPCOMMAND(ClearDepth)
4785                 INTERPCOMMAND(ColorMask)
4786                 INTERPCOMMAND(DepthTest)
4787                 INTERPCOMMAND(ScissorTest)
4788                 INTERPCOMMAND(Scissor)
4789                 INTERPCOMMAND(BlendFunc)
4790                 INTERPCOMMAND(BlendSubtract)
4791                 INTERPCOMMAND(DepthMask)
4792                 INTERPCOMMAND(DepthFunc)
4793                 INTERPCOMMAND(DepthRange)
4794                 INTERPCOMMAND(PolygonOffset)
4795                 INTERPCOMMAND(CullFace)
4796                 INTERPCOMMAND(AlphaTest)
4797                 INTERPCOMMAND(AlphaFunc)
4798                 INTERPCOMMAND(SetTexture)
4799                 INTERPCOMMAND(SetShader)
4800                 INTERPCOMMAND(Uniform4f)
4801                 INTERPCOMMAND(UniformMatrix4f)
4802                 INTERPCOMMAND(Uniform1i)
4803
4804                 case DPSOFTRAST_OPCODE_Draw:
4805                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4806                         commandoffset += command->commandsize;
4807                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4808                                 commandoffset = 0;
4809                         thread->commandoffset = commandoffset;
4810                         break;
4811
4812                 case DPSOFTRAST_OPCODE_Reset:
4813                         commandoffset = 0;
4814                         break;
4815                 }
4816         }
4817         thread->commandoffset = commandoffset;
4818 }
4819
#ifdef USE_THREADS
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Loops until thread->index becomes negative, interpreting commands whenever
// the thread's offset differs from dpsoftrast.drawcommand and sleeping on
// drawcond otherwise. Always returns 0.
static int DPSOFTRAST_Draw_Thread(void *data)
{
	DPSOFTRAST_State_Thread *self = (DPSOFTRAST_State_Thread *)data;

	while (self->index >= 0)
	{
		if (self->commandoffset == dpsoftrast.drawcommand)
		{
			// caught up: re-check under the mutex before going to sleep
			SDL_LockMutex(self->drawmutex);
			if (self->commandoffset == dpsoftrast.drawcommand && self->index >= 0)
			{
				// tell a flusher blocked on waitcond that this thread is idle
				if (self->waiting)
					SDL_CondSignal(self->waitcond);
				self->starving = true;
				SDL_CondWait(self->drawcond, self->drawmutex);
				self->starving = false;
			}
			SDL_UnlockMutex(self->drawmutex);
			continue;
		}

		// pending work: run everything up to the current end of the queue
		DPSOFTRAST_Draw_InterpretCommands(self, dpsoftrast.drawcommand);
	}

	return 0;
}
#endif
4846
4847 static void DPSOFTRAST_Draw_FlushThreads(void)
4848 {
4849         DPSOFTRAST_State_Thread *thread;
4850         int i;
4851         DPSOFTRAST_Draw_SyncCommands();
4852 #ifdef USE_THREADS
4853         for (i = 0; i < dpsoftrast.numthreads; i++)
4854         {
4855                 thread = &dpsoftrast.threads[i];
4856                 if (thread->commandoffset != dpsoftrast.drawcommand)
4857                 {
4858                         SDL_LockMutex(thread->drawmutex);
4859                         if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4860                                 SDL_CondSignal(thread->drawcond);
4861                         SDL_UnlockMutex(thread->drawmutex);
4862                 }
4863         }
4864 #endif                  
4865         for (i = 0; i < dpsoftrast.numthreads; i++)
4866         {
4867                 thread = &dpsoftrast.threads[i];
4868 #ifdef USE_THREADS
4869                 if (thread->commandoffset != dpsoftrast.drawcommand)
4870                 {
4871                         SDL_LockMutex(thread->drawmutex);
4872                         if (thread->commandoffset != dpsoftrast.drawcommand)
4873                         {
4874                                 thread->waiting = true;
4875                                 SDL_CondWait(thread->waitcond, thread->drawmutex);
4876                                 thread->waiting = false;
4877                         }
4878                         SDL_UnlockMutex(thread->drawmutex);
4879                 }
4880 #else
4881                 if (thread->commandoffset != dpsoftrast.drawcommand)
4882                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4883 #endif
4884         }
4885         dpsoftrast.commandpool.usedcommands = 0;
4886 }
4887
// Public entry point: blocks until all queued rasterizer commands have been
// executed (thin wrapper around DPSOFTRAST_Draw_FlushThreads).
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4892
// Public entry point: same effect as DPSOFTRAST_Flush, i.e. all queued
// rasterizer commands are executed before this returns.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4897
4898 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4899 {
4900         int i;
4901         union
4902         {
4903                 int i;
4904                 unsigned char b[4];
4905         }
4906         u;
4907         u.i = 1;
4908         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4909         dpsoftrast.bigendian = u.b[3];
4910         dpsoftrast.fb_width = width;
4911         dpsoftrast.fb_height = height;
4912         dpsoftrast.fb_depthpixels = depthpixels;
4913         dpsoftrast.fb_colorpixels[0] = colorpixels;
4914         dpsoftrast.fb_colorpixels[1] = NULL;
4915         dpsoftrast.fb_colorpixels[1] = NULL;
4916         dpsoftrast.fb_colorpixels[1] = NULL;
4917         dpsoftrast.viewport[0] = 0;
4918         dpsoftrast.viewport[1] = 0;
4919         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4920         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4921         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4922         dpsoftrast.texture_firstfree = 1;
4923         dpsoftrast.texture_end = 1;
4924         dpsoftrast.texture_max = 0;
4925         dpsoftrast.color[0] = 1;
4926         dpsoftrast.color[1] = 1;
4927         dpsoftrast.color[2] = 1;
4928         dpsoftrast.color[3] = 1;
4929 #ifdef USE_THREADS
4930         dpsoftrast.numthreads = bound(1, numthreads, 64);
4931 #else
4932         dpsoftrast.numthreads = 1;
4933 #endif
4934         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4935         for (i = 0; i < dpsoftrast.numthreads; i++)
4936         {
4937                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4938                 thread->index = i;
4939                 thread->cullface = GL_BACK;
4940                 thread->colormask[1] = 1;
4941                 thread->colormask[2] = 1;
4942                 thread->colormask[3] = 1;
4943                 thread->blendfunc[0] = GL_ONE;
4944                 thread->blendfunc[1] = GL_ZERO;
4945                 thread->depthmask = true;
4946                 thread->depthtest = true;
4947                 thread->depthfunc = GL_LEQUAL;
4948                 thread->scissortest = false;
4949                 thread->alphatest = false;
4950                 thread->alphafunc = GL_GREATER;
4951                 thread->alphavalue = 0.5f;
4952                 thread->viewport[0] = 0;
4953                 thread->viewport[1] = 0;
4954                 thread->viewport[2] = dpsoftrast.fb_width;
4955                 thread->viewport[3] = dpsoftrast.fb_height;
4956                 thread->scissor[0] = 0;
4957                 thread->scissor[1] = 0;
4958                 thread->scissor[2] = dpsoftrast.fb_width;
4959                 thread->scissor[3] = dpsoftrast.fb_height;
4960                 thread->depthrange[0] = 0;
4961                 thread->depthrange[1] = 1;
4962                 thread->polygonoffset[0] = 0;
4963                 thread->polygonoffset[1] = 0;
4964
4965                 thread->numspans = 0;
4966                 thread->numtriangles = 0;
4967                 thread->commandoffset = 0;
4968                 thread->waiting = false;
4969                 thread->starving = false;
4970 #ifdef USE_THREADS
4971                 thread->waitcond = SDL_CreateCond();
4972                 thread->drawcond = SDL_CreateCond();
4973                 thread->drawmutex = SDL_CreateMutex();
4974 #endif
4975
4976                 thread->validate = -1;
4977                 DPSOFTRAST_Validate(thread, -1);
4978 #ifdef USE_THREADS
4979                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4980 #endif
4981         }
4982 }
4983
4984 void DPSOFTRAST_Shutdown(void)
4985 {
4986         int i;
4987 #ifdef USE_THREADS
4988         if(dpsoftrast.numthreads > 0)
4989         {
4990                 DPSOFTRAST_State_Thread *thread;
4991                 for (i = 0; i < dpsoftrast.numthreads; i++)
4992                 {
4993                         thread = &dpsoftrast.threads[i];
4994                         SDL_LockMutex(thread->drawmutex);
4995                         thread->index = -1;
4996                         SDL_CondSignal(thread->drawcond);
4997                         SDL_UnlockMutex(thread->drawmutex);
4998                         SDL_WaitThread(thread->thread, NULL);
4999                         SDL_DestroyCond(thread->waitcond);
5000                         SDL_DestroyCond(thread->drawcond);
5001                         SDL_DestroyMutex(thread->drawmutex);
5002                 }
5003         }
5004 #endif
5005         for (i = 0;i < dpsoftrast.texture_end;i++)
5006                 if (dpsoftrast.texture[i].bytes)
5007                         MM_FREE(dpsoftrast.texture[i].bytes);
5008         if (dpsoftrast.texture)
5009                 free(dpsoftrast.texture);
5010         if (dpsoftrast.threads)
5011                 MM_FREE(dpsoftrast.threads);
5012         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5013 }
5014