/*
 * dpsoftrast.c -- xonotic/darkplaces.git (git.xonotic.org)
 * Revision: band interlacing option (vid_soft_interlace)
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
// Threaded rendering is only enabled for SDL builds; the SDL headers are included further down when USE_THREADS is set.
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
// Plain C builds get bool as an alias of the engine's qboolean type.
12 #ifndef __cplusplus
13 typedef qboolean bool;
14 #endif
15
// Byte alignments: ALIGN_SIZE is reused as COMMAND_SIZE, ATOMIC_SIZE is the alignment requested by MM_MALLOC/MM_CALLOC.
16 #define ALIGN_SIZE 16
17 #define ATOMIC_SIZE 32
18
// SSE2 builds: select compiler-specific alignment attributes (ALIGN = 16 bytes, ATOMIC = 32 bytes)
// and, for threaded builds, the atomic counter primitives and the memory barrier.
// Compilers that are neither GCC-compatible nor MSVC drop back to the non-SSE2, non-threaded setup.
19 #ifdef SSE2_PRESENT
20         #if defined(__GNUC__)
21                 #define ALIGN(var) var __attribute__((__aligned__(16)))
22                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
23                 #ifdef USE_THREADS
                        // barrier is an SSE store fence; the compiler builtin on the next line is kept as a disabled alternative
24                         #define MEMORY_BARRIER (_mm_sfence())
25                         //(__sync_synchronize())
26                         #define ATOMIC_COUNTER volatile int
                        // increment/decrement evaluate to the updated counter value; ATOMIC_ADD yields no value here
27                         #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28                         #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29                         #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
30                 #endif
31         #elif defined(_MSC_VER)
32                 #define ALIGN(var) __declspec(align(16)) var
33                 #define ATOMIC(var) __declspec(align(32)) var
34                 #ifdef USE_THREADS
35                         #define MEMORY_BARRIER (_mm_sfence())
36                         //(MemoryBarrier())
37                         #define ATOMIC_COUNTER volatile LONG
                        // NOTE(review): unlike the GCC variant, ATOMIC_ADD is not cast to void here, so it yields InterlockedExchangeAdd's result
38                         #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39                         #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
40                         #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
41                 #endif
42         #else
                // unknown compiler: no alignment attributes available, so disable both threading and the SSE2 paths
43                 #undef USE_THREADS
44                 #undef SSE2_PRESENT
45         #endif
46 #endif
47
// Without SSE2 the alignment macros apply no attribute at all.
// Threading is switched off as well: the atomic counter primitives and
// MEMORY_BARRIER for threaded builds are only defined in the SSE2 section, so a
// threaded non-SSE2 build (USE_SDL without SSE2_PRESENT) would otherwise be
// left without ATOMIC_COUNTER, ATOMIC_INCREMENT, ATOMIC_DECREMENT, ATOMIC_ADD
// and MEMORY_BARRIER.  The unknown-compiler branch of the SSE2 section makes
// the same pairing of "no SSE2" with "no threads".
#ifndef SSE2_PRESENT
	#undef USE_THREADS
	#define ALIGN(var) var
	#define ATOMIC(var) var
#endif
52
// Threaded builds pull in SDL for the thread, condition variable and mutex types.
// Single-threaded builds instead get plain (non-atomic) counter operations, a no-op
// memory barrier, and void stand-ins for the SDL types so that declarations using
// SDL_Thread*, SDL_cond* and SDL_mutex* still compile.
53 #ifdef USE_THREADS
54 #include <SDL.h>
55 #include <SDL_thread.h>
56 #else
57         #define MEMORY_BARRIER ((void)0)
58         #define ATOMIC_COUNTER int
59         #define ATOMIC_INCREMENT(counter) (++(counter))
60         #define ATOMIC_DECREMENT(counter) (--(counter))
61         #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62         typedef void SDL_Thread;
63         typedef void SDL_cond;
64         typedef void SDL_mutex;
65 #endif
66
// Pixel/buffer allocation helpers.
// SSE2 builds allocate with ATOMIC_SIZE alignment through _mm_malloc/_mm_free;
// other builds map straight onto malloc/calloc/free.  Memory obtained from
// MM_MALLOC or MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// Aligned counterpart of calloc(): returns nmemb*size zeroed bytes aligned to
// ATOMIC_SIZE, or NULL if the allocation fails or the byte count would not
// fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() rejects overflowing requests; do the same instead of silently
	// allocating a wrapped-around (too small) buffer
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
85
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-array tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
101
// One slot of the texture table.  A slot is considered in use while bytes is non-NULL.
102 typedef struct DPSOFTRAST_Texture_s
103 {
104         int flags; // DPSOFTRAST_TEXTURE_FLAG_* / format bits passed to DPSOFTRAST_Texture_New
105         int width; // dimensions of mip level 0
106         int height;
107         int depth;
108         int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
109         DPSOFTRAST_TEXTURE_FILTER filter; // set by DPSOFTRAST_Texture_Filter
110         int mipmaps; // number of valid entries in mipmap[]
111         int size; // total bytes of all mip levels together
112         ATOMIC_COUNTER binds; // while non-zero, the texture functions call DPSOFTRAST_Flush() before modifying the texture
113         unsigned char *bytes; // pixel storage for all mip levels (4 bytes per pixel), allocated with MM_CALLOC
114         int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
115 }
116 DPSOFTRAST_Texture;
117
// Commands use the same alignment as ALIGN() data.
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
120
// Generic command header; every command struct produced by DEFCOMMAND starts with these same two fields.
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
122 {
123         unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
124         unsigned short commandsize; // presumably the size of the whole command in bytes -- TODO confirm against the code that queues commands
125 }
126 DPSOFTRAST_Command);
127
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
129
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant DPSOFTRAST_OPCODE_<name>
// and an aligned struct DPSOFTRAST_Command_<name> consisting of the common
// opcode/commandsize header followed by the given fields.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
133         { \
134                 unsigned char opcode; \
135                 unsigned short commandsize; \
136                 fields \
137         } DPSOFTRAST_Command_##name );
138
// MAXCOMMANDPOOL is the byte size of the command pool buffer.
// NOTE(review): MAXCOMMANDSIZE looks like the upper bound for a single command -- confirm at its use sites.
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
141
// Storage for queued commands.
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
143 {
144         int freecommand; // copied into dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
145         int usedcommands; // NOTE(review): presumably the amount of pool space currently occupied -- confirm in the pool management code
146         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]); // raw command bytes
147 }
148 DPSOFTRAST_State_Command_Pool);
149
// Per-triangle interpolation setup consumed by the DPSOFTRAST_CALCATTRIB macros.
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
151 {
152         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
153         float w[3];
        // for each attribute array: [0] = change per x step, [1] = change per y step, [2] = base value
        // (the CALCATTRIB macros evaluate base + span.x * [0] + span.y * [1])
154         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
155 }
156 DPSOFTRAST_State_Triangle);
157
// Evaluates attribute array `arrayindex` of `triangle` at the start of `span`.
// On exit `slope` holds the per-x-step change (attribs[arrayindex][0]) and
// `data` holds  attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// This variant works on __m128 values (SSE); attribs must be 16-byte aligned.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar variant of DPSOFTRAST_CALCATTRIB: `data` and `slope` are float[4].
// The span argument is parenthesized before dereferencing so that expression
// arguments such as &localspan or spans + i expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
174                                         
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
176
// One horizontal run of pixels belonging to a triangle; x and y are the
// coordinates the CALCATTRIB macros multiply the triangle's slopes by.
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
178 {
179         int triangle; // triangle this span was generated by
180         int x; // framebuffer x coord
181         int y; // framebuffer y coord
182         int length; // pixel count
183         int startx; // usable range (according to pixelmask)
184         int endx; // usable range (according to pixelmask)
185         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
186 }
187 DPSOFTRAST_State_Span);
188
// capacities of the per-thread spans[] and triangles[] buffers
189 #define DPSOFTRAST_DRAW_MAXSPANS 1024
190 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
191
// Bits of DPSOFTRAST_State_Thread.validate: each set bit means the matching group of
// derived values is stale and is recomputed by DPSOFTRAST_Validate
// (FB -> DPSOFTRAST_RecalcFB, DEPTHFUNC -> DPSOFTRAST_RecalcDepthFunc, BLENDFUNC -> DPSOFTRAST_RecalcBlendFunc).
192 #define DPSOFTRAST_VALIDATE_FB 1
193 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
194 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
195 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
196
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc picks
// one of them from the current blend factors.  DPSOFTRAST_BLENDMODE_TOTAL is
// the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_TOTAL = 9
}
DPSOFTRAST_BLENDMODE;
211
// State kept separately for every render thread: its own render state values,
// the values derived from them (recomputed on demand according to the validate
// bits), the band boundaries, synchronisation objects, and span/triangle buffers.
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
213 {
214         SDL_Thread *thread;
215         int index;
216         
217         int cullface;
218         int colormask[4];
219         int blendfunc[2]; // [0] source factor, [1] destination factor (GL_* constants, see DPSOFTRAST_RecalcBlendFunc)
220         int blendsubtract;
221         int depthmask;
222         int depthtest;
223         int depthfunc;
224         int scissortest;
225         int alphatest;
226         int alphafunc;
227         float alphavalue;
228         int viewport[4];
229         int scissor[4];
230         float depthrange[2];
231         float polygonoffset[2];
232
233         int shader_mode;
234         int shader_permutation;
235
236         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow when that array moves
237         
238         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
240
241         // DPSOFTRAST_VALIDATE_ flags
242         int validate;
243
244         // derived values (DPSOFTRAST_VALIDATE_FB)
245         int fb_colormask;
246         int fb_clearscissor[4]; // x, y, width, height of the scissor rectangle clamped to the framebuffer (see DPSOFTRAST_RecalcFB)
247         ALIGN(float fb_viewportcenter[4]);
248         ALIGN(float fb_viewportscale[4]);
249
250         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
251         int fb_depthfunc; // depthfunc when depthtest is enabled, otherwise GL_ALWAYS
252
253         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
254         int fb_blendmode; // a DPSOFTRAST_BLENDMODE_* value
255
256         // band boundaries
257         int miny1;
258         int maxy1;
259         int miny2;
260         int maxy2;
261
        // NOTE(review): presumably this thread's read position in the command pool -- confirm in the thread loop
262         ATOMIC(volatile int commandoffset);
263
264         volatile bool waiting;
265         volatile bool starving;
266         SDL_cond *waitcond;
267         SDL_cond *drawcond;
268         SDL_mutex *drawmutex;
269
270         int numspans;
271         int numtriangles;
272         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
273         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
274 }
275 DPSOFTRAST_State_Thread);
276
// Global renderer state; the single instance is the variable dpsoftrast declared right after the struct.
277 typedef ATOMIC(struct DPSOFTRAST_State_s
278 {
        // render targets, set by DPSOFTRAST_SetRenderTargets
279         int fb_width;
280         int fb_height;
281         unsigned int *fb_depthpixels;
282         unsigned int *fb_colorpixels[4];
283
284         int viewport[4];
285         ALIGN(float fb_viewportcenter[4]);
286         ALIGN(float fb_viewportscale[4]);
287
288         float color[4];
289         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
290         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
291
292         const float *pointer_vertex3f;
293         const float *pointer_color4f;
294         const unsigned char *pointer_color4ub;
295         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
296         int stride_vertex;
297         int stride_color;
298         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
299         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
300         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into texture[]; rebased by DPSOFTRAST_Texture_Grow
301
302         int firstvertex;
303         int numvertices;
304         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305         float *screencoord4f;
306         int drawstarty;
307         int drawendy;
308         int drawclipped;
309         
310         int shader_mode;
311         int shader_permutation;
312
        // texture table: texture_max is the allocated capacity, texture_end is one past the
        // highest slot in use, texture_firstfree is where DPSOFTRAST_Texture_New starts
        // looking for a free slot
313         int texture_max;
314         int texture_end;
315         int texture_firstfree;
316         DPSOFTRAST_Texture *texture;
317
318         int bigendian;
319
320         // error reporting
321         const char *errorstring;
322
323         int interlace;
324         int numthreads;
325         DPSOFTRAST_State_Thread *threads;
326
        // updated from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
327         ATOMIC(volatile int drawcommand);
328
329         DPSOFTRAST_State_Command_Pool commandpool;
330 }
331 DPSOFTRAST_State);
332
333 DPSOFTRAST_State dpsoftrast;
334
// Depth buffer values are 1-d scaled by DPSOFTRAST_DEPTHSCALE (2^30) and truncated to int.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32bit pixel: each channel is scaled by
// 255 and rounded, then placed as alpha<<24 | red<<16 | green<<8 | blue.
// Every macro argument is parenthesized so that expression arguments
// (e.g. a sum or a conditional) are converted as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer representation.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
340
341 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
342 {
343         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
344         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
345         fb_viewportcenter[3] = 0.5f;
346         fb_viewportcenter[0] = 0.0f;
347         fb_viewportscale[1] = 0.5f * viewport[2];
348         fb_viewportscale[2] = -0.5f * viewport[3];
349         fb_viewportscale[3] = 0.5f;
350         fb_viewportscale[0] = 1.0f;
351 }
352
353 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
354 {
355         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
356         // and viewport projection values
357         int x1, x2;
358         int y1, y2;
359         x1 = thread->scissor[0];
360         x2 = thread->scissor[0] + thread->scissor[2];
361         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
362         y2 = dpsoftrast.fb_height - thread->scissor[1];
363         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
364         if (x1 < 0) x1 = 0;
365         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
366         if (y1 < 0) y1 = 0;
367         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
368         thread->fb_clearscissor[0] = x1;
369         thread->fb_clearscissor[1] = y1;
370         thread->fb_clearscissor[2] = x2 - x1;
371         thread->fb_clearscissor[3] = y2 - y1;
372
373         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
374 }
375
376 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
377 {
378         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
379 }
380
381 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
382 {
383         if (thread->blendsubtract)
384         {
385                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
386                 {
387                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
388                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
389                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
390                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
391                 }
392         }
393         else
394         {       
395                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
396                 {
397                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
398                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
399                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
400                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
401                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
402                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
403                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
404                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
405                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
406                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
407                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
408                 }
409         }
410 }
411
412 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
413
414 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
415 {
416         mask &= thread->validate;
417         if (!mask)
418                 return;
419         if (mask & DPSOFTRAST_VALIDATE_FB)
420         {
421                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
422                 DPSOFTRAST_RecalcFB(thread);
423         }
424         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
425         {
426                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
427                 DPSOFTRAST_RecalcDepthFunc(thread);
428         }
429         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
430         {
431                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
432                 DPSOFTRAST_RecalcBlendFunc(thread);
433         }
434 }
435
436 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
437 {
438         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
439                 return &dpsoftrast.texture[index];
440         return NULL;
441 }
442
443 static void DPSOFTRAST_Texture_Grow(void)
444 {
445         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
446         DPSOFTRAST_State_Thread *thread;
447         int i;
448         int j;
449         DPSOFTRAST_Flush();
450         // expand texture array as needed
451         if (dpsoftrast.texture_max < 1024)
452                 dpsoftrast.texture_max = 1024;
453         else
454                 dpsoftrast.texture_max *= 2;
455         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
456         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                 if (dpsoftrast.texbound[i])
458                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
459         for (j = 0; j < dpsoftrast.numthreads; j++)
460         {
461                 thread = &dpsoftrast.threads[j];
462                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
463                         if (thread->texbound[i])
464                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
465         }
466 }
467
468 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
469 {
470         int w;
471         int h;
472         int d;
473         int size;
474         int s;
475         int texnum;
476         int mipmaps;
477         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
478         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
479         DPSOFTRAST_Texture *texture;
480         if (width*height*depth < 1)
481         {
482                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
483                 return 0;
484         }
485         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
486         {
487                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
488                 return 0;
489         }
490         switch(texformat)
491         {
492         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
493         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
494         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
495                 break;
496         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
497                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
498                 {
499                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
500                         return 0;
501                 }
502                 if (depth != 1)
503                 {
504                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
505                         return 0;
506                 }
507                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
508                 {
509                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
510                         return 0;
511                 }
512                 break;
513         }
514         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
515         {
516                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
517                 return 0;
518         }
519         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520         {
521                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
522                 return 0;
523         }
524         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
527                 return 0;
528         }
529         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
532                 return 0;
533         }
534         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
535         {
536                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
537                 return 0;
538         }
539         // find first empty slot in texture array
540         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
541                 if (!dpsoftrast.texture[texnum].bytes)
542                         break;
543         dpsoftrast.texture_firstfree = texnum + 1;
544         if (dpsoftrast.texture_max <= texnum)
545                 DPSOFTRAST_Texture_Grow();
546         if (dpsoftrast.texture_end <= texnum)
547                 dpsoftrast.texture_end = texnum + 1;
548         texture = &dpsoftrast.texture[texnum];
549         memset(texture, 0, sizeof(*texture));
550         texture->flags = flags;
551         texture->width = width;
552         texture->height = height;
553         texture->depth = depth;
554         texture->sides = sides;
555         texture->binds = 0;
556         w = width;
557         h = height;
558         d = depth;
559         size = 0;
560         mipmaps = 0;
561         w = width;
562         h = height;
563         d = depth;
564         for (;;)
565         {
566                 s = w * h * d * sides * 4;
567                 texture->mipmap[mipmaps][0] = size;
568                 texture->mipmap[mipmaps][1] = s;
569                 texture->mipmap[mipmaps][2] = w;
570                 texture->mipmap[mipmaps][3] = h;
571                 texture->mipmap[mipmaps][4] = d;
572                 size += s;
573                 mipmaps++;
574                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
575                         break;
576                 if (w > 1) w >>= 1;
577                 if (h > 1) h >>= 1;
578                 if (d > 1) d >>= 1;
579         }
580         texture->mipmaps = mipmaps;
581         texture->size = size;
582
583         // allocate the pixels now
584         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
585
586         return texnum;
587 }
588 void DPSOFTRAST_Texture_Free(int index)
589 {
590         DPSOFTRAST_Texture *texture;
591         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
592         if (texture->binds)
593                 DPSOFTRAST_Flush();
594         if (texture->bytes)
595                 MM_FREE(texture->bytes);
596         texture->bytes = NULL;
597         memset(texture, 0, sizeof(*texture));
598         // adjust the free range and used range
599         if (dpsoftrast.texture_firstfree > index)
600                 dpsoftrast.texture_firstfree = index;
601         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
602                 dpsoftrast.texture_end--;
603 }
604 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
605 {
606         int i, x, y, z, w, layer0, layer1, row0, row1;
607         unsigned char *o, *i0, *i1, *i2, *i3;
608         DPSOFTRAST_Texture *texture;
609         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
610         if (texture->mipmaps <= 1)
611                 return;
612         for (i = 1;i < texture->mipmaps;i++)
613         {
614                 for (z = 0;z < texture->mipmap[i][4];z++)
615                 {
616                         layer0 = z*2;
617                         layer1 = z*2+1;
618                         if (layer1 >= texture->mipmap[i-1][4])
619                                 layer1 = texture->mipmap[i-1][4]-1;
620                         for (y = 0;y < texture->mipmap[i][3];y++)
621                         {
622                                 row0 = y*2;
623                                 row1 = y*2+1;
624                                 if (row1 >= texture->mipmap[i-1][3])
625                                         row1 = texture->mipmap[i-1][3]-1;
626                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
627                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
628                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
629                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
630                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
631                                 w = texture->mipmap[i][2];
632                                 if (layer1 > layer0)
633                                 {
634                                         if (texture->mipmap[i-1][2] > 1)
635                                         {
636                                                 // average 3D texture
637                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
638                                                 {
639                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
640                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
641                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
642                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
643                                                 }
644                                         }
645                                         else
646                                         {
647                                                 // average 3D mipmap with parent width == 1
648                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
649                                                 {
650                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
651                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
652                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
653                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
654                                                 }
655                                         }
656                                 }
657                                 else
658                                 {
659                                         if (texture->mipmap[i-1][2] > 1)
660                                         {
661                                                 // average 2D texture (common case)
662                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
663                                                 {
664                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
665                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
666                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
667                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
668                                                 }
669                                         }
670                                         else
671                                         {
672                                                 // 2D texture with parent width == 1
673                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
674                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
675                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
676                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
677                                         }
678                                 }
679                         }
680                 }
681         }
682 }
683 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
684 {
685         DPSOFTRAST_Texture *texture;
686         unsigned char *dst;
687         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
688         if (texture->binds)
689                 DPSOFTRAST_Flush();
690         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
691         while (blockheight > 0)
692         {
693                 memcpy(dst, pixels, blockwidth * 4);
694                 pixels += blockwidth * 4;
695                 dst += texture->mipmap[0][2] * 4;
696                 blockheight--;
697         }
698         DPSOFTRAST_Texture_CalculateMipmaps(index);
699 }
700 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
701 {
702         DPSOFTRAST_Texture *texture;
703         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
704         if (texture->binds)
705                 DPSOFTRAST_Flush();
706         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
707         DPSOFTRAST_Texture_CalculateMipmaps(index);
708 }
709 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][2];
714 }
715 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][3];
720 }
721 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         return texture->mipmap[mip][4];
726 }
727 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
728 {
729         DPSOFTRAST_Texture *texture;
730         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
731         if (texture->binds)
732                 DPSOFTRAST_Flush();
733         return texture->bytes + texture->mipmap[mip][0];
734 }
735 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
736 {
737         DPSOFTRAST_Texture *texture;
738         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
739         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
740         {
741                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
742                 return;
743         }
744         if (texture->binds)
745                 DPSOFTRAST_Flush();
746         texture->filter = filter;
747 }
748
749 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
750 {
751         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
752                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
753                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
754                 DPSOFTRAST_Flush();
755         dpsoftrast.fb_width = width;
756         dpsoftrast.fb_height = height;
757         dpsoftrast.fb_depthpixels = depthpixels;
758         dpsoftrast.fb_colorpixels[0] = colorpixels0;
759         dpsoftrast.fb_colorpixels[1] = colorpixels1;
760         dpsoftrast.fb_colorpixels[2] = colorpixels2;
761         dpsoftrast.fb_colorpixels[3] = colorpixels3;
762 }
763
764 static void DPSOFTRAST_Draw_FlushThreads(void);
765
766 static void DPSOFTRAST_Draw_SyncCommands(void)
767 {
768         MEMORY_BARRIER;
769         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
770 }
771
// Makes sure at least `space` bytes of the command pool are unused.
// With threads: repeatedly finds the thread whose read position lags furthest
// behind the write position and waits on it until enough of the pool has been
// consumed, then stores the recomputed usage in commandpool.usedcommands.
// Without threads: simply flushes.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
#ifdef USE_THREADS
	const int limit = DPSOFTRAST_DRAW_MAXCOMMANDPOOL - space;
	int writepos = dpsoftrast.commandpool.freecommand;
	int inuse = dpsoftrast.commandpool.usedcommands;
	int t;

	if (inuse <= limit)
	{
		return;
	}
	DPSOFTRAST_Draw_SyncCommands();
	for (;;)
	{
		DPSOFTRAST_State_Thread *slowest = NULL;

		// the pool usage is the largest backlog of any thread
		inuse = 0;
		for (t = 0; t < dpsoftrast.numthreads; t++)
		{
			DPSOFTRAST_State_Thread *candidate = &dpsoftrast.threads[t];
			int backlog = writepos - candidate->commandoffset;
			if (backlog < 0)
			{
				// read position is ahead of the write position in the ring
				backlog += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
			}
			if (backlog > inuse)
			{
				slowest = candidate;
				inuse = backlog;
			}
		}
		if (inuse <= limit || !slowest)
		{
			break;
		}

		SDL_LockMutex(slowest->drawmutex);
		if (slowest->commandoffset != dpsoftrast.drawcommand)
		{
			slowest->waiting = true;
			if (slowest->starving)
			{
				SDL_CondSignal(slowest->drawcond);
			}
			SDL_CondWait(slowest->waitcond, slowest->drawmutex);
			slowest->waiting = false;
		}
		SDL_UnlockMutex(slowest->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = inuse;
#else
	DPSOFTRAST_Draw_FlushThreads();
#endif
}
817
818 #define DPSOFTRAST_ALIGNCOMMAND(size) \
819         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
820 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
821         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
822
823 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
824 {
825         DPSOFTRAST_Command *command;
826         int freecommand = dpsoftrast.commandpool.freecommand;
827         int usedcommands = dpsoftrast.commandpool.usedcommands;
828         int extra = sizeof(DPSOFTRAST_Command);
829         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
830                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
831         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
832         {
833                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834                 freecommand = dpsoftrast.commandpool.freecommand;
835                 usedcommands = dpsoftrast.commandpool.usedcommands;
836         }
837         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
838         {
839                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
840                 command->opcode = DPSOFTRAST_OPCODE_Reset;
841                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
842                 freecommand = 0;
843         }
844         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845         command->opcode = opcode;
846         command->commandsize = size;
847         freecommand += size;
848         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
849                 freecommand = 0;
850         dpsoftrast.commandpool.freecommand = freecommand;
851         dpsoftrast.commandpool.usedcommands = usedcommands + size;
852         return command;
853 }
854
855 static void DPSOFTRAST_UndoCommand(int size)
856 {
857         int freecommand = dpsoftrast.commandpool.freecommand;
858         int usedcommands = dpsoftrast.commandpool.usedcommands;
859         freecommand -= size;
860         if (freecommand < 0)
861                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
862         usedcommands -= size;
863         dpsoftrast.commandpool.freecommand = freecommand;
864         dpsoftrast.commandpool.usedcommands = usedcommands;
865 }
866                 
867 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
868 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
869 {
870         thread->viewport[0] = command->x;
871         thread->viewport[1] = command->y;
872         thread->viewport[2] = command->width;
873         thread->viewport[3] = command->height;
874         thread->validate |= DPSOFTRAST_VALIDATE_FB;
875 }
876 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
877 {
878         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
879         command->x = x;
880         command->y = y;
881         command->width = width;
882         command->height = height;
883
884         dpsoftrast.viewport[0] = x;
885         dpsoftrast.viewport[1] = y;
886         dpsoftrast.viewport[2] = width;
887         dpsoftrast.viewport[3] = height;
888         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
889 }
890
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
893 {
894         int i, x1, y1, x2, y2, w, h, x, y;
895         int miny1 = thread->miny1;
896         int maxy1 = thread->maxy1;
897         int miny2 = thread->miny2;
898         int maxy2 = thread->maxy2;
899         int bandy;
900         unsigned int *p;
901         unsigned int c;
902         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
903         x1 = thread->fb_clearscissor[0];
904         y1 = thread->fb_clearscissor[1];
905         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
906         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
907         if (y1 < miny1) y1 = miny1;
908         if (y2 > maxy2) y2 = maxy2;
909         w = x2 - x1;
910         h = y2 - y1;
911         if (w < 1 || h < 1)
912                 return;
913         // FIXME: honor fb_colormask?
914         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915         for (i = 0;i < 4;i++)
916         {
917                 if (!dpsoftrast.fb_colorpixels[i])
918                         continue;
919                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
920                 for (;y < bandy;y++)
921                 {
922                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
923                         for (x = x1;x < x2;x++)
924                                 p[x] = c;
925                 }
926         }
927 }
928 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
929 {
930         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
931         command->r = r;
932         command->g = g;
933         command->b = b;
934         command->a = a;
935 }
936
937 DEFCOMMAND(3, ClearDepth, float depth;)
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
939 {
940         int x1, y1, x2, y2, w, h, x, y;
941         int miny1 = thread->miny1;
942         int maxy1 = thread->maxy1;
943         int miny2 = thread->miny2;
944         int maxy2 = thread->maxy2;
945         int bandy;
946         unsigned int *p;
947         unsigned int c;
948         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949         x1 = thread->fb_clearscissor[0];
950         y1 = thread->fb_clearscissor[1];
951         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
952         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
953         if (y1 < miny1) y1 = miny1;
954         if (y2 > maxy2) y2 = maxy2;
955         w = x2 - x1;
956         h = y2 - y1;
957         if (w < 1 || h < 1)
958                 return;
959         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
960         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
961         for (;y < bandy;y++)
962         {
963                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
964                 for (x = x1;x < x2;x++)
965                         p[x] = c;
966         }
967 }
968 void DPSOFTRAST_ClearDepth(float d)
969 {
970         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
971         command->depth = d;
972 }
973
974 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
975 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
976 {
977         thread->colormask[0] = command->r != 0;
978         thread->colormask[1] = command->g != 0;
979         thread->colormask[2] = command->b != 0;
980         thread->colormask[3] = command->a != 0;
981         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
982 }
983 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
984 {
985         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
986         command->r = r;
987         command->g = g;
988         command->b = b;
989         command->a = a;
990 }
991
992 DEFCOMMAND(5, DepthTest, int enable;)
993 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
994 {
995         thread->depthtest = command->enable;
996         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
997 }
998 void DPSOFTRAST_DepthTest(int enable)
999 {
1000         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1001         command->enable = enable;
1002 }
1003
1004 DEFCOMMAND(6, ScissorTest, int enable;)
1005 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1006 {
1007         thread->scissortest = command->enable;
1008         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1009 }
1010 void DPSOFTRAST_ScissorTest(int enable)
1011 {
1012         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1018 {
1019         thread->scissor[0] = command->x;
1020         thread->scissor[1] = command->y;
1021         thread->scissor[2] = command->width;
1022         thread->scissor[3] = command->height;
1023         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1024 }
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1026 {
1027         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1028         command->x = x;
1029         command->y = y;
1030         command->width = width;
1031         command->height = height;
1032 }
1033
1034 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1035 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1036 {
1037         thread->blendfunc[0] = command->sfactor;
1038         thread->blendfunc[1] = command->dfactor;
1039         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1040 }
1041 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1042 {
1043         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1044         command->sfactor = sfactor;
1045         command->dfactor = dfactor;
1046 }
1047
1048 DEFCOMMAND(9, BlendSubtract, int enable;)
1049 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1050 {
1051         thread->blendsubtract = command->enable;
1052         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1053 }
1054 void DPSOFTRAST_BlendSubtract(int enable)
1055 {
1056         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1057         command->enable = enable;
1058 }
1059
1060 DEFCOMMAND(10, DepthMask, int enable;)
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1062 {
1063         thread->depthmask = command->enable;
1064 }
1065 void DPSOFTRAST_DepthMask(int enable)
1066 {
1067         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068         command->enable = enable;
1069 }
1070
1071 DEFCOMMAND(11, DepthFunc, int func;)
1072 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1073 {
1074         thread->depthfunc = command->func;
1075 }
1076 void DPSOFTRAST_DepthFunc(int func)
1077 {
1078         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1079         command->func = func;
1080 }
1081
1082 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1083 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1084 {
1085         thread->depthrange[0] = command->nearval;
1086         thread->depthrange[1] = command->farval;
1087 }
1088 void DPSOFTRAST_DepthRange(float nearval, float farval)
1089 {
1090         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1091         command->nearval = nearval;
1092         command->farval = farval;
1093 }
1094
1095 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1096 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1097 {
1098         thread->polygonoffset[0] = command->alongnormal;
1099         thread->polygonoffset[1] = command->intoview;
1100 }
1101 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1102 {
1103         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1104         command->alongnormal = alongnormal;
1105         command->intoview = intoview;
1106 }
1107
1108 DEFCOMMAND(14, CullFace, int mode;)
1109 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1110 {
1111         thread->cullface = command->mode;
1112 }
1113 void DPSOFTRAST_CullFace(int mode)
1114 {
1115         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1116         command->mode = mode;
1117 }
1118
1119 DEFCOMMAND(15, AlphaTest, int enable;)
1120 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1121 {
1122         thread->alphatest = command->enable;
1123 }
1124 void DPSOFTRAST_AlphaTest(int enable)
1125 {
1126         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1127         command->enable = enable;
1128 }
1129
1130 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1131 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1132 {
1133         thread->alphafunc = command->func;
1134         thread->alphavalue = command->ref;
1135 }
1136 void DPSOFTRAST_AlphaFunc(int func, float ref)
1137 {
1138         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1139         command->func = func;
1140         command->ref = ref;
1141 }
1142
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1144 {
1145         dpsoftrast.color[0] = r;
1146         dpsoftrast.color[1] = g;
1147         dpsoftrast.color[2] = b;
1148         dpsoftrast.color[3] = a;
1149 }
1150
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1152 {
1153         int outstride = blockwidth * 4;
1154         int instride = dpsoftrast.fb_width * 4;
1155         int bx1 = blockx;
1156         int by1 = blocky;
1157         int bx2 = blockx + blockwidth;
1158         int by2 = blocky + blockheight;
1159         int bw;
1160         int bh;
1161         int x;
1162         int y;
1163         unsigned char *inpixels;
1164         unsigned char *b;
1165         unsigned char *o;
1166         DPSOFTRAST_Flush();
1167         if (bx1 < 0) bx1 = 0;
1168         if (by1 < 0) by1 = 0;
1169         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1171         bw = bx2 - bx1;
1172         bh = by2 - by1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         if (texture->binds)
1227                 DPSOFTRAST_Flush();
1228         spixels = dpsoftrast.fb_colorpixels[0];
1229         swidth = dpsoftrast.fb_width;
1230         sheight = dpsoftrast.fb_height;
1231         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1232         twidth = texture->mipmap[mip][2];
1233         theight = texture->mipmap[mip][3];
1234         if (tx1 < 0) tx1 = 0;
1235         if (ty1 < 0) ty1 = 0;
1236         if (tx2 > twidth) tx2 = twidth;
1237         if (ty2 > theight) ty2 = theight;
1238         if (sx1 < 0) sx1 = 0;
1239         if (sy1 < 0) sy1 = 0;
1240         if (sx2 > swidth) sx2 = swidth;
1241         if (sy2 > sheight) sy2 = sheight;
1242         tw = tx2 - tx1;
1243         th = ty2 - ty1;
1244         sw = sx2 - sx1;
1245         sh = sy2 - sy1;
1246         if (tw > sw) tw = sw;
1247         if (th > sh) th = sh;
1248         if (tw < 1 || th < 1)
1249                 return;
1250         for (y = 0;y < th;y++)
1251                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1252         if (texture->mipmaps > 1)
1253                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1254 }
1255
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 {
1259         if (thread->texbound[command->unitnum])
1260                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->unitnum = unitnum;
1281         command->texture = texture;
1282
1283         dpsoftrast.texbound[unitnum] = texture;
1284         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->mode = mode;
1321         command->permutation = permutation;
1322
1323         dpsoftrast.shader_mode = mode;
1324         dpsoftrast.shader_permutation = permutation;
1325 }
1326
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1329 {
1330         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1331 }
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         command->val[0] = v0;
1337         command->val[1] = v1;
1338         command->val[2] = v2;
1339         command->val[3] = v3;
1340
1341         dpsoftrast.uniform4f[index*4+0] = v0;
1342         dpsoftrast.uniform4f[index*4+1] = v1;
1343         dpsoftrast.uniform4f[index*4+2] = v2;
1344         dpsoftrast.uniform4f[index*4+3] = v3;
1345 }
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 {
1348         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349         command->index = index;
1350         memcpy(command->val, v, sizeof(command->val));
1351
1352         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1353 }
1354
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1361 {
1362 #ifdef SSE2_PRESENT
1363         int i, index;
1364         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365         {
1366                 __m128 m0, m1, m2, m3;
1367                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368                 command->index = index;
1369                 if (((size_t)v)&(ALIGN_SIZE-1))
1370                 {
1371                         m0 = _mm_loadu_ps(v);
1372                         m1 = _mm_loadu_ps(v+4);
1373                         m2 = _mm_loadu_ps(v+8);
1374                         m3 = _mm_loadu_ps(v+12);
1375                 }
1376                 else
1377                 {
1378                         m0 = _mm_load_ps(v);
1379                         m1 = _mm_load_ps(v+4);
1380                         m2 = _mm_load_ps(v+8);
1381                         m3 = _mm_load_ps(v+12);
1382                 }
1383                 if (transpose)
1384                 {
1385                         __m128 t0, t1, t2, t3;
1386                         t0 = _mm_unpacklo_ps(m0, m1);
1387                         t1 = _mm_unpacklo_ps(m2, m3);
1388                         t2 = _mm_unpackhi_ps(m0, m1);
1389                         t3 = _mm_unpackhi_ps(m2, m3);
1390                         m0 = _mm_movelh_ps(t0, t1);
1391                         m1 = _mm_movehl_ps(t1, t0);
1392                         m2 = _mm_movelh_ps(t2, t3);
1393                         m3 = _mm_movehl_ps(t3, t2);                     
1394                 }
1395                 _mm_store_ps(command->val, m0);
1396                 _mm_store_ps(command->val+4, m1);
1397                 _mm_store_ps(command->val+8, m2);
1398                 _mm_store_ps(command->val+12, m3);
1399                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1403         }
1404 #endif
1405 }
1406
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1409 {
1410         thread->uniform1i[command->index] = command->val;
1411 }
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1413 {
1414         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415         command->index = index;
1416         command->val = i0;
1417
1418         dpsoftrast.uniform1i[command->index] = i0;
1419 }
1420
1421 #ifdef SSE2_PRESENT
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1423 {
1424         float *end = dst + size*4;
1425         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1426         {
1427                 while (dst < end)
1428                 {
1429                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1430                         dst += 4;
1431                         src += stride;
1432                 }
1433         }
1434         else
1435         {
1436                 while (dst < end)
1437                 {
1438                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1439                         dst += 4;
1440                         src += stride;
1441                 }
1442         }
1443 }
1444
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1446 {
1447         float *end = dst + size*4;
1448         if (stride == sizeof(float[3]))
1449         {
1450                 float *end4 = dst + (size&~3)*4;        
1451                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1452                 {
1453                         while (dst < end4)
1454                         {
1455                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1456                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dst += 16;
1469                                 src += 4*sizeof(float[3]);
1470                         }
1471                 }
1472                 else
1473                 {
1474                         while (dst < end4)
1475                         {
1476                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489                                 dst += 16;
1490                                 src += 4*sizeof(float[3]);
1491                         }
1492                 }
1493         }
1494         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1495         {
1496                 while (dst < end)
1497                 {
1498                         __m128 v = _mm_loadu_ps((const float *)src);
1499                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502                         _mm_store_ps(dst, v);
1503                         dst += 4;
1504                         src += stride;
1505                 }
1506         }
1507         else
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_load_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520 }
1521
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1523 {
1524         float *end = dst + size*4;
1525         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526         if (stride == sizeof(float[2]))
1527         {
1528                 float *end2 = dst + (size&~1)*4;
1529                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1530                 {
1531                         while (dst < end2)
1532                         {
1533                                 __m128 v = _mm_loadu_ps((const float *)src);
1534                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1536                                 dst += 8;
1537                                 src += 2*sizeof(float[2]);
1538                         }
1539                 }
1540                 else
1541                 {
1542                         while (dst < end2)
1543                         {
1544                                 __m128 v = _mm_load_ps((const float *)src);
1545                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1547                                 dst += 8;
1548                                 src += 2*sizeof(float[2]);
1549                         }
1550                 }
1551         }
1552         while (dst < end)
1553         {
1554                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1555                 dst += 4;
1556                 src += stride;
1557         }
1558 }
1559
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 {
1562         float *end = dst + size*4;
1563         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564         if (stride == sizeof(unsigned char[4]))
1565         {
1566                 float *end4 = dst + (size&~3)*4;
1567                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1568                 {
1569                         while (dst < end4)
1570                         {
1571                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576                                 dst += 16;
1577                                 src += 4*sizeof(unsigned char[4]);
1578                         }
1579                 }
1580                 else
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593         }
1594         while (dst < end)
1595         {
1596                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1598                 dst += 4;
1599                 src += stride;
1600         }
1601 }
1602
// Writes the four floats found at src into each of the first 'size'
// four-float vectors of dst.  src is read with an unaligned load; dst is
// written with aligned stores (_mm_store_ps).
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + 4*i, value);
}
1613 #endif
1614
// Multiplies 'numitems' four-float vectors by a 4x4 matrix:
//   out = in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3
// where rowN is the four floats at inmatrix16f + 4*N.
// out4f is written with aligned stores and may be the same buffer as in4f.
// A matrix that is bitwise equal to identity turns the call into a plain copy
// (skipped entirely when out4f == in4f).
// Without SSE2_PRESENT the function body is empty.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	int unaligned;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	// in4f advances by 16 bytes per vertex, so its alignment never changes
	unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (; out4f < stop; out4f += 4, in4f += 4)
	{
		__m128 v = unaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		__m128 xxxx = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 yyyy = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
		__m128 zzzz = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
		__m128 wwww = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
		// summed from the last term inwards: x + (y + (z + w))
		__m128 sum = _mm_add_ps(_mm_mul_ps(zzzz, row2), _mm_mul_ps(wwww, row3));
		sum = _mm_add_ps(_mm_mul_ps(yyyy, row1), sum);
		sum = _mm_add_ps(_mm_mul_ps(xxxx, row0), sum);
		_mm_store_ps(out4f, sum);
	}
#endif
}
1662
// Copies 'numitems' four-float vectors from in4f to out4f.
// Nothing is copied when numitems is not positive or when both pointers refer
// to the same buffer (the same in-place guard DPSOFTRAST_Vertex_Transform uses
// on its identity path).  memmove is used so that partially overlapping
// buffers are copied correctly as well.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, numitems * sizeof(float[4]));
}
1667
1668 #ifdef SSE2_PRESENT
// Perspective-divides the vertex 'in' = (x, y, z, w) and applies the viewport
// mapping.  With center = viewportcenter and scale = viewportscale the result
// stored to 'out' is
//   ( center[1] + scale[1]*x/w,
//     center[2] + scale[2]*y/w,
//     center[3] + scale[3]*z/w,
//     center[0] + scale[0]/w )
// i.e. lane 0 of center/scale belongs to the w output and lanes 1..3 to x, y
// and z.  'out' must be an assignable __m128 and may be the same variable as
// 'in'.  The temporaries carry a prefix so that caller variables named p or w
// can be passed as arguments.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 projectvertex_p = (in), projectvertex_w = _mm_shuffle_ps(projectvertex_p, projectvertex_p, _MM_SHUFFLE(3, 3, 3, 3)); \
	projectvertex_p = _mm_move_ss(_mm_shuffle_ps(projectvertex_p, projectvertex_p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	projectvertex_p = _mm_add_ps((viewportcenter), _mm_div_ps(_mm_mul_ps((viewportscale), projectvertex_p), projectvertex_w)); \
	(out) = _mm_shuffle_ps(projectvertex_p, projectvertex_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1676
// Performs the same computation as DPSOFTRAST_PROJECTVERTEX: perspective-divides
// 'in' = (x, y, z, w) and applies the viewport mapping, storing
//   ( center[1] + scale[1]*x/w,
//     center[2] + scale[2]*y/w,
//     center[3] + scale[3]*z/w,
//     center[0] + scale[0]/w )
// to 'out' (center = viewportcenter, scale = viewportscale).
// 'out' must be an assignable __m128 and may be the same variable as 'in'.
// The temporaries carry a prefix so that caller variables named p or w can be
// passed as arguments.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 projecty_p = (in), projecty_w = _mm_shuffle_ps(projecty_p, projecty_p, _MM_SHUFFLE(3, 3, 3, 3)); \
	projecty_p = _mm_move_ss(_mm_shuffle_ps(projecty_p, projecty_p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	projecty_p = _mm_add_ps((viewportcenter), _mm_div_ps(_mm_mul_ps((viewportscale), projecty_p), projecty_w)); \
	(out) = _mm_shuffle_ps(projecty_p, projecty_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1684
// Transforms the vertex 'in' = (x, y, z, w) by the four __m128 values m0..m3:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// 'out' must be an assignable __m128 and may be the same variable as 'in'.
// The temporary carries a prefix so that a caller variable named p can be
// passed as an argument.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 transformvertex_p = (in); \
	(out) = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(0, 0, 0, 0)), (m0)), \
			_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(1, 1, 1, 1)), (m1)), \
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(2, 2, 2, 2)), (m2)), \
					_mm_mul_ps(_mm_shuffle_ps(transformvertex_p, transformvertex_p, _MM_SHUFFLE(3, 3, 3, 3)), (m3))))); \
}
1693
// Computes the screen-space Y extent of the axis-aligned box spanned by
// minpos/maxpos after transformation by m0..m3.
// The eight box corners are transformed; a corner counts as clipped when its
// transformed z + w is negative.  Unclipped corners contribute their y/w to
// the running minimum/maximum, and every box edge joining a clipped corner to
// an unclipped one contributes the y/w of the point where z + w reaches zero.
// The extremes are limited to [-2, 2], mapped through the Y lane (index 2) of
// viewportcenter/viewportscale and truncated to integers:
//   *starty receives the mapped maximum, *endy the mapped minimum plus one.
// Returns a mask with bit k set for each clipped corner k (bit 0 of k selects
// maxpos.x, bit 1 maxpos.y, bit 2 maxpos.z/w).
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	__m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	// swap lanes 0 and 1 of every matrix row so that the transformed Y ends up
	// in lane 0, where the scalar (_ss) operations below work
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// transform corner k, record its clip distance, and if it is not clipped
	// clear its mask bit and fold its y/w into the extremes
	#define BBFRONT(k, pos) \
	{ \
		__m128 cornerw; \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		cornerw = _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), cornerw); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			__m128 cornery = _mm_div_ss(bb[k], cornerw); \
			clipmask &= ~(1<<k); \
			minproj = _mm_min_ss(minproj, cornery); \
			maxproj = _mm_max_ss(maxproj, cornery); \
		} \
	}
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// edge from clipped corner k to neighbour n: when n is not clipped, find
	// the point on the edge where the clip distance is zero and fold its y/w
	// into the extremes
	#define BBCLIPEDGE(k, n) \
	if (!(clipmask&(1<<(n)))) \
	{ \
		__m128 edgefrac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[n])); \
		__m128 edgehit = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(edgefrac, edgefrac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[n], bb[k]))); \
		edgehit = _mm_div_ss(edgehit, _mm_shuffle_ps(edgehit, edgehit, _MM_SHUFFLE(3, 3, 3, 3))); \
		minproj = _mm_min_ss(minproj, edgehit); \
		maxproj = _mm_max_ss(maxproj, edgehit); \
	}
	// a clipped corner is examined against its three edge neighbours, which
	// differ from it in exactly one bit of the corner index
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			BBCLIPEDGE(k, k^1) \
			BBCLIPEDGE(k, k^2) \
			BBCLIPEDGE(k, k^4) \
		} \
	}
	BBCLIP(0);
	BBCLIP(1);
	BBCLIP(2);
	BBCLIP(3);
	BBCLIP(4);
	BBCLIP(5);
	BBCLIP(6);
	BBCLIP(7);
	// bring lane 2 of the viewport vectors into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1764 #endif
1765         
// Copies 'numitems' four-float vertices from in4f to out4f and writes their
// viewport-projected form (DPSOFTRAST_PROJECTVERTEX with
// dpsoftrast.fb_viewportcenter / fb_viewportscale) to screen4f.
// out4f and screen4f are written with aligned stores; in4f may be unaligned
// and may be the same buffer as out4f.
// When both starty and endy are non-NULL, the component-wise bounds of the
// input are passed with an identity matrix to DPSOFTRAST_Vertex_BoundY, which
// fills *starty / *endy, and its clip mask is returned.  Otherwise 0 is
// returned.  Without SSE2_PRESENT nothing is written and 0 is returned.
// Note: the bounds are seeded from the first vertex, so in4f is read once even
// when numitems is 0.
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	__m128 minpos, maxpos;
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
					_mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
#endif
	// kept outside the conditional block so that every build returns a value
	return 0;
}
1813
// Transforms 'numitems' four-float vertices from in4f by the 4x4 matrix at
// inmatrix16f (DPSOFTRAST_TRANSFORMVERTEX), stores the transformed vertices
// to out4f and their viewport-projected form (DPSOFTRAST_PROJECTVERTEX with
// dpsoftrast.fb_viewportcenter / fb_viewportscale) to screen4f.
// out4f and screen4f are written with aligned stores; in4f may be unaligned
// and may be the same buffer as out4f.
// A matrix bitwise equal to identity is handed to DPSOFTRAST_Vertex_Project.
// When both starty and endy are non-NULL, the component-wise bounds of the
// untransformed input are passed together with the matrix to
// DPSOFTRAST_Vertex_BoundY, which fills *starty / *endy, and its clip mask is
// returned.  Otherwise 0 is returned.  Without SSE2_PRESENT nothing is
// written and 0 is returned.
// Note: the bounds are seeded from the first vertex, so in4f is read once even
// when numitems is 0.
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
		return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
	end = out4f + numitems*4;
	viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
#endif
	// kept outside the conditional block so that every build returns a value
	return 0;
}
1868
1869 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1870 {
1871         float *outf = dpsoftrast.post_array4f[outarray];
1872         const unsigned char *inb;
1873         int firstvertex = dpsoftrast.firstvertex;
1874         int numvertices = dpsoftrast.numvertices;
1875         int stride;
1876         switch(inarray)
1877         {
1878         case DPSOFTRAST_ARRAY_POSITION:
1879                 stride = dpsoftrast.stride_vertex;
1880                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1881                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1882                 break;
1883         case DPSOFTRAST_ARRAY_COLOR:
1884                 stride = dpsoftrast.stride_color;
1885                 if (dpsoftrast.pointer_color4f)
1886                 {
1887                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1888                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1889                 }
1890                 else if (dpsoftrast.pointer_color4ub)
1891                 {
1892                         stride = dpsoftrast.stride_color;
1893                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1894                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1895                 }
1896                 else
1897                 {
1898                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1899                 }
1900                 break;
1901         default:
1902                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1903                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1904                 {
1905                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1906                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1907                         {
1908                         case 2:
1909                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1910                                 break;
1911                         case 3:
1912                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1913                                 break;
1914                         case 4:
1915                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1916                                 break;
1917                         }
1918                 }
1919                 break;
1920         }
1921         return outf;
1922 }
1923
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1925 {
1926         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1928         return data;
1929 }
1930
1931 #if 0
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1933 {
1934         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1935         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
1936         return data;
1937 }
1938 #endif
1939
1940 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1941 {
1942         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1943         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1944         return data;
1945 }
1946
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1948 {
1949         int x;
1950         int startx = span->startx;
1951         int endx = span->endx;
1952         float wslope = triangle->w[0];
1953         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1954         float endz = 1.0f / (w + wslope * startx);
1955         for (x = startx;x < endx;)
1956         {
1957                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1958                 float z = endz, dz;
1959                 if (nextsub >= endx) nextsub = endsub = endx-1;
1960                 endz = 1.0f / (w + wslope * nextsub);
1961                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962                 for (; x <= endsub; x++, z += dz)
1963                         zf[x] = z;
1964         }
1965 }
1966
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1968 {
1969         int x;
1970         int startx = span->startx;
1971         int endx = span->endx;
1972         int d[4];
1973         float a, b;
1974         unsigned char * RESTRICT pixelmask = span->pixelmask;
1975         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1976         if (!pixel)
1977                 return;
1978         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979         // handle alphatest now (this affects depth writes too)
1980         if (thread->alphatest)
1981                 for (x = startx;x < endx;x++)
1982                         if (in4f[x*4+3] < 0.5f)
1983                                 pixelmask[x] = false;
1984         // FIXME: this does not handle bigendian
1985         switch(thread->fb_blendmode)
1986         {
1987         case DPSOFTRAST_BLENDMODE_OPAQUE:
1988                 for (x = startx;x < endx;x++)
1989                 {
1990                         if (!pixelmask[x])
1991                                 continue;
1992                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_ALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = in4f[x*4+3] * 255.0f;
2008                         b = 1.0f - in4f[x*4+3];
2009                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020                 for (x = startx;x < endx;x++)
2021                 {
2022                         if (!pixelmask[x])
2023                                 continue;
2024                         a = in4f[x*4+3] * 255.0f;
2025                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_ADD:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044                         pixel[x*4+0] = d[0];
2045                         pixel[x*4+1] = d[1];
2046                         pixel[x*4+2] = d[2];
2047                         pixel[x*4+3] = d[3];
2048                 }
2049                 break;
2050         case DPSOFTRAST_BLENDMODE_INVMOD:
2051                 for (x = startx;x < endx;x++)
2052                 {
2053                         if (!pixelmask[x])
2054                                 continue;
2055                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059                         pixel[x*4+0] = d[0];
2060                         pixel[x*4+1] = d[1];
2061                         pixel[x*4+2] = d[2];
2062                         pixel[x*4+3] = d[3];
2063                 }
2064                 break;
2065         case DPSOFTRAST_BLENDMODE_MUL:
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (!pixelmask[x])
2069                                 continue;
2070                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074                         pixel[x*4+0] = d[0];
2075                         pixel[x*4+1] = d[1];
2076                         pixel[x*4+2] = d[2];
2077                         pixel[x*4+3] = d[3];
2078                 }
2079                 break;
2080         case DPSOFTRAST_BLENDMODE_MUL2:
2081                 for (x = startx;x < endx;x++)
2082                 {
2083                         if (!pixelmask[x])
2084                                 continue;
2085                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089                         pixel[x*4+0] = d[0];
2090                         pixel[x*4+1] = d[1];
2091                         pixel[x*4+2] = d[2];
2092                         pixel[x*4+3] = d[3];
2093                 }
2094                 break;
2095         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096                 for (x = startx;x < endx;x++)
2097                 {
2098                         if (!pixelmask[x])
2099                                 continue;
2100                         a = in4f[x*4+3] * -255.0f;
2101                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105                         pixel[x*4+0] = d[0];
2106                         pixel[x*4+1] = d[1];
2107                         pixel[x*4+2] = d[2];
2108                         pixel[x*4+3] = d[3];
2109                 }
2110                 break;
2111         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112                 for (x = startx;x < endx;x++)
2113                 {
2114                         if (!pixelmask[x])
2115                                 continue;
2116                         a = 255.0f;
2117                         b = 1.0f - in4f[x*4+3];
2118                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         }
2129 }
2130
2131 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2132 {
2133 #ifdef SSE2_PRESENT
2134         int x;
2135         int startx = span->startx;
2136         int endx = span->endx;
2137         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2138         unsigned char * RESTRICT pixelmask = span->pixelmask;
2139         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2140         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2141         if (!pixel)
2142                 return;
2143         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2144         pixeli += span->y * dpsoftrast.fb_width + span->x;
2145         // handle alphatest now (this affects depth writes too)
2146         if (thread->alphatest)
2147                 for (x = startx;x < endx;x++)
2148                         if (in4ub[x*4+3] < 0.5f)
2149                                 pixelmask[x] = false;
2150         // FIXME: this does not handle bigendian
2151         switch(thread->fb_blendmode)
2152         {
2153         case DPSOFTRAST_BLENDMODE_OPAQUE:
2154                 for (x = startx;x + 4 <= endx;)
2155                 {
2156                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2157                         {
2158                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2159                                 x += 4;
2160                         }
2161                         else
2162                         {
2163                                 if (pixelmask[x])
2164                                         pixeli[x] = ini[x];
2165                                 x++;
2166                         }
2167                 }
2168                 for (;x < endx;x++)
2169                         if (pixelmask[x])
2170                                 pixeli[x] = ini[x];
2171                 break;
2172         case DPSOFTRAST_BLENDMODE_ALPHA:
2173         #define FINISHBLEND(blend2, blend1) \
2174                 for (x = startx;x + 2 <= endx;x += 2) \
2175                 { \
2176                         __m128i src, dst; \
2177                         switch (*(const unsigned short*)&pixelmask[x]) \
2178                         { \
2179                         case 0x0101: \
2180                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2181                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2182                                 blend2; \
2183                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2184                                 continue; \
2185                         case 0x0100: \
2186                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2187                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2188                                 blend1; \
2189                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2190                                 continue; \
2191                         case 0x0001: \
2192                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2193                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2194                                 blend1; \
2195                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2196                                 continue; \
2197                         } \
2198                         break; \
2199                 } \
2200                 for(;x < endx; x++) \
2201                 { \
2202                         __m128i src, dst; \
2203                         if (!pixelmask[x]) \
2204                                 continue; \
2205                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2207                         blend1; \
2208                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2209                 }
2210
2211                 FINISHBLEND({
2212                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2213                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2214                 }, {
2215                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2216                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2217                 });
2218                 break;
2219         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2220                 FINISHBLEND({
2221                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2222                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2223                 }, {
2224                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2225                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2226                 });
2227                 break;
2228         case DPSOFTRAST_BLENDMODE_ADD:
2229                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2230                 break;
2231         case DPSOFTRAST_BLENDMODE_INVMOD:
2232                 FINISHBLEND({
2233                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2234                 }, {
2235                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2236                 });
2237                 break;
2238         case DPSOFTRAST_BLENDMODE_MUL:
2239                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_MUL2:
2242                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2243                 break;
2244         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2245                 FINISHBLEND({
2246                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2247                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2248                 }, {
2249                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2250                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2251                 });
2252                 break;
2253         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2254                 FINISHBLEND({
2255                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2256                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2257                 }, {
2258                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2259                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2260                 });
2261                 break;
2262         }
2263 #endif
2264 }
2265
2266 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2267 {
2268         int x;
2269         int startx = span->startx;
2270         int endx = span->endx;
2271         int flags;
2272         float c[4];
2273         float data[4];
2274         float slope[4];
2275         float tc[2], endtc[2];
2276         float tcscale[2];
2277         unsigned int tci[2];
2278         unsigned int tci1[2];
2279         unsigned int tcimin[2];
2280         unsigned int tcimax[2];
2281         int tciwrapmask[2];
2282         int tciwidth;
2283         int filter;
2284         int mip;
2285         const unsigned char * RESTRICT pixelbase;
2286         const unsigned char * RESTRICT pixel[4];
2287         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2288         // if no texture is bound, just fill it with white
2289         if (!texture)
2290         {
2291                 for (x = startx;x < endx;x++)
2292                 {
2293                         out4f[x*4+0] = 1.0f;
2294                         out4f[x*4+1] = 1.0f;
2295                         out4f[x*4+2] = 1.0f;
2296                         out4f[x*4+3] = 1.0f;
2297                 }
2298                 return;
2299         }
2300         mip = triangle->mip[texunitindex];
2301         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2302         // if this mipmap of the texture is 1 pixel, just fill it with that color
2303         if (texture->mipmap[mip][1] == 4)
2304         {
2305                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2306                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2307                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2308                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2309                 for (x = startx;x < endx;x++)
2310                 {
2311                         out4f[x*4+0] = c[0];
2312                         out4f[x*4+1] = c[1];
2313                         out4f[x*4+2] = c[2];
2314                         out4f[x*4+3] = c[3];
2315                 }
2316                 return;
2317         }
2318         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2319         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2320         flags = texture->flags;
2321         tcscale[0] = texture->mipmap[mip][2];
2322         tcscale[1] = texture->mipmap[mip][3];
2323         tciwidth = texture->mipmap[mip][2];
2324         tcimin[0] = 0;
2325         tcimin[1] = 0;
2326         tcimax[0] = texture->mipmap[mip][2]-1;
2327         tcimax[1] = texture->mipmap[mip][3]-1;
2328         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2329         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2330         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2331         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2332         for (x = startx;x < endx;)
2333         {
2334                 unsigned int subtc[2];
2335                 unsigned int substep[2];
2336                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2337                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2338                 if (nextsub >= endx)
2339                 {
2340                         nextsub = endsub = endx-1;      
2341                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2342                 }
2343                 tc[0] = endtc[0];
2344                 tc[1] = endtc[1];
2345                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2346                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2347                 substep[0] = (endtc[0] - tc[0]) * subscale;
2348                 substep[1] = (endtc[1] - tc[1]) * subscale;
2349                 subtc[0] = tc[0] * (1<<16);
2350                 subtc[1] = tc[1] * (1<<16);
2351                 if (filter)
2352                 {
2353                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2354                         {
2355                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2356                                 {
2357                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2358                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2359                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2360                                         tci[0] = subtc[0]>>16;
2361                                         tci[1] = subtc[1]>>16;
2362                                         tci1[0] = tci[0] + 1;
2363                                         tci1[1] = tci[1] + 1;
2364                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2365                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2366                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2367                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2368                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2369                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2370                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2371                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2372                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2373                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2374                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2375                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2376                                         out4f[x*4+0] = c[0];
2377                                         out4f[x*4+1] = c[1];
2378                                         out4f[x*4+2] = c[2];
2379                                         out4f[x*4+3] = c[3];
2380                                 }
2381                         }
2382                         else
2383                         {
2384                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2385                                 {
2386                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2387                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2388                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2389                                         tci[0] = subtc[0]>>16;
2390                                         tci[1] = subtc[1]>>16;
2391                                         tci1[0] = tci[0] + 1;
2392                                         tci1[1] = tci[1] + 1;
2393                                         tci[0] &= tciwrapmask[0];
2394                                         tci[1] &= tciwrapmask[1];
2395                                         tci1[0] &= tciwrapmask[0];
2396                                         tci1[1] &= tciwrapmask[1];
2397                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2398                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2399                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2400                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2401                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2402                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2403                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2404                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2405                                         out4f[x*4+0] = c[0];
2406                                         out4f[x*4+1] = c[1];
2407                                         out4f[x*4+2] = c[2];
2408                                         out4f[x*4+3] = c[3];
2409                                 }
2410                         }
2411                 }
2412                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2413                 {
2414                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415                         {
2416                                 tci[0] = subtc[0]>>16;
2417                                 tci[1] = subtc[1]>>16;
2418                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2419                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2420                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2421                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2422                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2423                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2424                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2425                                 out4f[x*4+0] = c[0];
2426                                 out4f[x*4+1] = c[1];
2427                                 out4f[x*4+2] = c[2];
2428                                 out4f[x*4+3] = c[3];
2429                         }
2430                 }
2431                 else
2432                 {
2433                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2434                         {
2435                                 tci[0] = subtc[0]>>16;
2436                                 tci[1] = subtc[1]>>16;
2437                                 tci[0] &= tciwrapmask[0];
2438                                 tci[1] &= tciwrapmask[1];
2439                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2440                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2441                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2442                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2443                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2444                                 out4f[x*4+0] = c[0];
2445                                 out4f[x*4+1] = c[1];
2446                                 out4f[x*4+2] = c[2];
2447                                 out4f[x*4+3] = c[3];
2448                         }
2449                 }
2450         }
2451 }
2452
2453 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2454 {
2455 #ifdef SSE2_PRESENT
2456         int x;
2457         int startx = span->startx;
2458         int endx = span->endx;
2459         int flags;
2460         __m128 data, slope, tcscale;
2461         __m128i tcsize, tcmask, tcoffset, tcmax;
2462         __m128 tc, endtc;
2463         __m128i subtc, substep, endsubtc;
2464         int filter;
2465         int mip;
2466         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2467         const unsigned char * RESTRICT pixelbase;
2468         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2469         // if no texture is bound, just fill it with white
2470         if (!texture)
2471         {
2472                 memset(out4ub + startx*4, 255, span->length*4);
2473                 return;
2474         }
2475         mip = triangle->mip[texunitindex];
2476         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2477         // if this mipmap of the texture is 1 pixel, just fill it with that color
2478         if (texture->mipmap[mip][1] == 4)
2479         {
2480                 unsigned int k = *((const unsigned int *)pixelbase);
2481                 for (x = startx;x < endx;x++)
2482                         outi[x] = k;
2483                 return;
2484         }
2485         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2486         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2487         flags = texture->flags;
2488         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2489         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2490         tcscale = _mm_cvtepi32_ps(tcsize);
2491         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2492         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2493         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2494         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2495         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2496         tcmax = _mm_packs_epi32(tcmask, tcmask);
2497         for (x = startx;x < endx;)
2498         {
2499                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2500                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2501                 if (nextsub >= endx)
2502                 {
2503                         nextsub = endsub = endx-1;
2504                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2505                 }       
2506                 tc = endtc;
2507                 subtc = endsubtc;
2508                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2509                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2510                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2511                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2512                 substep = _mm_slli_epi32(substep, 1);
2513                 if (filter)
2514                 {
2515                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2516                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2517                         {
2518                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2519                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2520                                 {
2521                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2522                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2523                                         tci = _mm_madd_epi16(tci, tcoffset);
2524                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2525                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2526                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2527                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2528                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2529                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2530                                         fracm = _mm_srli_epi16(subtc, 1);
2531                                         pix1 = _mm_add_epi16(pix1,
2532                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2533                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2534                                         pix3 = _mm_add_epi16(pix3,
2535                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2536                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2537                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2538                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2539                                         pix2 = _mm_add_epi16(pix2,
2540                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2541                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2542                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2543                                 }
2544                                 if (x <= endsub)
2545                                 {
2546                                         const unsigned char * RESTRICT ptr1;
2547                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2548                                         tci = _mm_madd_epi16(tci, tcoffset);
2549                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2550                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2551                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2552                                         fracm = _mm_srli_epi16(subtc, 1);
2553                                         pix1 = _mm_add_epi16(pix1,
2554                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2557                                         pix1 = _mm_add_epi16(pix1,
2558                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2559                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2560                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2561                                         x++;
2562                                 }
2563                         }
2564                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2565                         {
2566                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2567                                 {
2568                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2569                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2570                                         tci = _mm_madd_epi16(tci, tcoffset);
2571                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2572                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2573                                                                                         _mm_setzero_si128());
2574                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2575                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2576                                                                                         _mm_setzero_si128());
2577                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2578                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2579                                         tci = _mm_madd_epi16(tci, tcoffset);
2580                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2581                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2582                                                                                         _mm_setzero_si128());
2583                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2584                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2585                                                                                         _mm_setzero_si128());
2586                                         fracm = _mm_srli_epi16(subtc, 1);
2587                                         pix1 = _mm_add_epi16(pix1,
2588                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2590                                         pix3 = _mm_add_epi16(pix3,
2591                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2592                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2593                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2594                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2595                                         pix2 = _mm_add_epi16(pix2,
2596                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2597                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2598                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2599                                 }
2600                                 if (x <= endsub)
2601                                 {
2602                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2603                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2604                                         tci = _mm_madd_epi16(tci, tcoffset);
2605                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2606                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2607                                                                                         _mm_setzero_si128());
2608                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2609                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2610                                                                                         _mm_setzero_si128());
2611                                         fracm = _mm_srli_epi16(subtc, 1);
2612                                         pix1 = _mm_add_epi16(pix1,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2615                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2616                                         pix1 = _mm_add_epi16(pix1,
2617                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2619                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2620                                         x++;
2621                                 }
2622                         }
2623                         else
2624                         {
2625                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2626                                 {
2627                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2628                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2629                                         tci = _mm_madd_epi16(tci, tcoffset);
2630                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632                                                                                         _mm_setzero_si128());
2633                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635                                                                                         _mm_setzero_si128());
2636                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2637                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2638                                         tci = _mm_madd_epi16(tci, tcoffset);
2639                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2640                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2641                                                                                         _mm_setzero_si128());
2642                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2643                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2644                                                                                         _mm_setzero_si128());
2645                                         fracm = _mm_srli_epi16(subtc, 1);
2646                                         pix1 = _mm_add_epi16(pix1,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2649                                         pix3 = _mm_add_epi16(pix3,
2650                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2651                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2652                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2653                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2654                                         pix2 = _mm_add_epi16(pix2,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2656                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2657                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2658                                 }
2659                                 if (x <= endsub)
2660                                 {
2661                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2662                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2663                                         tci = _mm_madd_epi16(tci, tcoffset);
2664                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2665                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2666                                                                                         _mm_setzero_si128());
2667                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2668                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2669                                                                                         _mm_setzero_si128());
2670                                         fracm = _mm_srli_epi16(subtc, 1);
2671                                         pix1 = _mm_add_epi16(pix1,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2674                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2675                                         pix1 = _mm_add_epi16(pix1,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2678                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2679                                         x++;
2680                                 }
2681                         }
2682                 }
2683                 else
2684                 {
2685                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2686                         {
2687                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2688                                 {
2689                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2690                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2691                                         tci = _mm_madd_epi16(tci, tcoffset);
2692                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2693                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2694                                 }
2695                                 if (x <= endsub)
2696                                 {
2697                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2698                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2699                                         tci = _mm_madd_epi16(tci, tcoffset);
2700                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2701                                         x++;
2702                                 }
2703                         }
2704                         else
2705                         {
2706                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2707                                 {
2708                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2709                                         tci = _mm_and_si128(tci, tcmax); 
2710                                         tci = _mm_madd_epi16(tci, tcoffset);
2711                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2712                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2713                                 }
2714                                 if (x <= endsub)
2715                                 {
2716                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2717                                         tci = _mm_and_si128(tci, tcmax); 
2718                                         tci = _mm_madd_epi16(tci, tcoffset);
2719                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2720                                         x++;
2721                                 }
2722                         }
2723                 }
2724         }
2725 #endif
2726 }
2727
2728 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2729 {
2730         // TODO: IMPLEMENT
2731         memset(out4ub, 255, span->length*4);
2732 }
2733
// Shadowmap lookup stub: the argument is ignored and the result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        (void)vector;
        return 1.0f;
}
2739
2740 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2741 {
2742         int x;
2743         int startx = span->startx;
2744         int endx = span->endx;
2745         float c[4];
2746         float data[4];
2747         float slope[4];
2748         float z;
2749         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2750         for (x = startx;x < endx;x++)
2751         {
2752                 z = zf[x];
2753                 c[0] = (data[0] + slope[0]*x) * z;
2754                 c[1] = (data[1] + slope[1]*x) * z;
2755                 c[2] = (data[2] + slope[2]*x) * z;
2756                 c[3] = (data[3] + slope[3]*x) * z;
2757                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2758                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2759                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2760                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2761         }
2762 }
2763
2764 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2765 {
2766         int x;
2767         int startx = span->startx;
2768         int endx = span->endx;
2769         float c[4];
2770         float data[4];
2771         float slope[4];
2772         float z;
2773         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2774         for (x = startx;x < endx;x++)
2775         {
2776                 z = zf[x];
2777                 c[0] = (data[0] + slope[0]*x) * z;
2778                 c[1] = (data[1] + slope[1]*x) * z;
2779                 c[2] = (data[2] + slope[2]*x) * z;
2780                 c[3] = (data[3] + slope[3]*x) * z;
2781                 out4f[x*4+0] = c[0];
2782                 out4f[x*4+1] = c[1];
2783                 out4f[x*4+2] = c[2];
2784                 out4f[x*4+3] = c[3];
2785         }
2786 }
2787
2788 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2789 {
2790         int x, startx = span->startx, endx = span->endx;
2791         float c[4], localcolor[4];
2792         localcolor[0] = subcolor[0];
2793         localcolor[1] = subcolor[1];
2794         localcolor[2] = subcolor[2];
2795         localcolor[3] = subcolor[3];
2796         for (x = startx;x < endx;x++)
2797         {
2798                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2799                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2800                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2801                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2802                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2803                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2804                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2805                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2806         }
2807 }
2808
2809 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2810 {
2811         int x, startx = span->startx, endx = span->endx;
2812         for (x = startx;x < endx;x++)
2813         {
2814                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2815                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2816                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2817                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2818         }
2819 }
2820
2821 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2822 {
2823         int x, startx = span->startx, endx = span->endx;
2824         for (x = startx;x < endx;x++)
2825         {
2826                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2827                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2828                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2829                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2830         }
2831 }
2832
2833 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2834 {
2835         int x, startx = span->startx, endx = span->endx;
2836         float a, b;
2837         for (x = startx;x < endx;x++)
2838         {
2839                 a = 1.0f - inb4f[x*4+3];
2840                 b = inb4f[x*4+3];
2841                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2842                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2843                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2844                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2845         }
2846 }
2847
2848 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2849 {
2850         int x, startx = span->startx, endx = span->endx;
2851         float localcolor[4], ilerp, lerp;
2852         localcolor[0] = color[0];
2853         localcolor[1] = color[1];
2854         localcolor[2] = color[2];
2855         localcolor[3] = color[3];
2856         ilerp = 1.0f - localcolor[3];
2857         lerp = localcolor[3];
2858         for (x = startx;x < endx;x++)
2859         {
2860                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2861                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2862                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2863                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2864         }
2865 }
2866
2867
2868
2869 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2870 {
2871 #ifdef SSE2_PRESENT
2872         int x;
2873         int startx = span->startx;
2874         int endx = span->endx;
2875         __m128 data, slope;
2876         __m128 mod, endmod;
2877         __m128i submod, substep, endsubmod;
2878         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2879         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2880         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2881         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2882         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2883         for (x = startx; x < endx;)
2884         {
2885                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2886                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2887                 if (nextsub >= endx)
2888                 {
2889                         nextsub = endsub = endx-1;
2890                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2891                 }
2892                 mod = endmod;
2893                 submod = endsubmod;
2894                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2895                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2896                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2897                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2898                 substep = _mm_packs_epi32(substep, substep);
2899                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2900                 {
2901                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2902                         pix = _mm_mulhi_epu16(pix, submod);
2903                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2904                 }
2905                 if (x <= endsub)
2906                 {
2907                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2908                         pix = _mm_mulhi_epu16(pix, submod);
2909                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2910                         x++;
2911                 }
2912         }
2913 #endif
2914 }
2915
2916 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2917 {
2918 #ifdef SSE2_PRESENT
2919         int x;
2920         int startx = span->startx;
2921         int endx = span->endx;
2922         __m128 data, slope;
2923         __m128 mod, endmod;
2924         __m128i submod, substep, endsubmod;
2925         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2926         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2927         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2928         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2929         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2930         for (x = startx; x < endx;)
2931         {
2932                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2933                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2934                 if (nextsub >= endx)
2935                 {
2936                         nextsub = endsub = endx-1;
2937                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2938                 }
2939                 mod = endmod;
2940                 submod = endsubmod;
2941                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2942                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2943                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2944                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2945                 substep = _mm_packs_epi32(substep, substep);
2946                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2947                 {
2948                         __m128i pix = _mm_srai_epi16(submod, 4);
2949                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2950                 }
2951                 if (x <= endsub)
2952                 {
2953                         __m128i pix = _mm_srai_epi16(submod, 4);
2954                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2955                         x++;
2956                 }
2957         }
2958 #endif
2959 }
2960
2961 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2962 {
2963 #ifdef SSE2_PRESENT
2964         int x, startx = span->startx, endx = span->endx;
2965         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2966         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2967         for (x = startx;x+2 <= endx;x+=2)
2968         {
2969                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2970                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2971                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2972                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2973         }
2974         if (x < endx)
2975         {
2976                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2977                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2978                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2979                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2980         }
2981 #endif
2982 }
2983
2984 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2985 {
2986 #ifdef SSE2_PRESENT
2987         int x, startx = span->startx, endx = span->endx;
2988         for (x = startx;x+2 <= endx;x+=2)
2989         {
2990                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2991                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2992                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2993                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2994         }
2995         if (x < endx)
2996         {
2997                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2998                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2999                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3000                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3001         }
3002 #endif
3003 }
3004
3005 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3006 {
3007 #ifdef SSE2_PRESENT
3008         int x, startx = span->startx, endx = span->endx;
3009         for (x = startx;x+2 <= endx;x+=2)
3010         {
3011                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3012                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3013                 pix1 = _mm_add_epi16(pix1, pix2);
3014                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3015         }
3016         if (x < endx)
3017         {
3018                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3019                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3020                 pix1 = _mm_add_epi16(pix1, pix2);
3021                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3022         }
3023 #endif
3024 }
3025
3026 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3027 {
3028 #ifdef SSE2_PRESENT
3029         int x, startx = span->startx, endx = span->endx;
3030         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3031         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3032         for (x = startx;x+2 <= endx;x+=2)
3033         {
3034                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3035                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3036                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3037                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3038         }
3039         if (x < endx)
3040         {
3041                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3042                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3043                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3044                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3045         }
3046 #endif
3047 }
3048
3049 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3050 {
3051 #ifdef SSE2_PRESENT
3052         int x, startx = span->startx, endx = span->endx;
3053         for (x = startx;x+2 <= endx;x+=2)
3054         {
3055                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3056                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3057                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3058                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3059                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3060         }
3061         if (x < endx)
3062         {
3063                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3065                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3066                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3067                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3068         }
3069 #endif
3070 }
3071
3072 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3073 {
3074 #ifdef SSE2_PRESENT
3075         int x, startx = span->startx, endx = span->endx;
3076         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3077         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3078         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3079         for (x = startx;x+2 <= endx;x+=2)
3080         {
3081                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3082                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3083                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3084         }
3085         if (x < endx)
3086         {
3087                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3088                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3089                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3090         }
3091 #endif
3092 }
3093
3094
3095
// Vertex stage of the generic shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix
// uniform, and passes the color and texcoord0 arrays through
// DPSOFTRAST_Array_Load.  texcoord1 is only loaded when the SPECULAR
// permutation bit is set.
void DPSOFTRAST_VertexShader_Generic(void)
{
	DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
	DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
	DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
	if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
		DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
}
3104
3105 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3106 {
3107         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3108         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3109         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3110         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3111         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3112         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3113         {
3114                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3115                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3116                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3117                 {
3118                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3119                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3120                         {
3121                                 // multiply
3122                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3123                         }
3124                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3125                         {
3126                                 // add
3127                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3128                         }
3129                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3130                         {
3131                                 // alphablend
3132                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3133                         }
3134                 }
3135         }
3136         else
3137                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3138         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3139 }
3140
3141
3142
// Vertex stage of the post-process shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix
// uniform and passes texcoord0 and texcoord1 through DPSOFTRAST_Array_Load.
void DPSOFTRAST_VertexShader_PostProcess(void)
{
	DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
	DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
	DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
}
3149
3150 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3151 {
3152         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3153         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3154         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3155         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3156         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3157         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3158         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3159         {
3160                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3161                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3162         }
3163         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3164         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3165         {
3166                 // TODO: implement saturation
3167         }
3168         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3169         {
3170                 // TODO: implement gammaramps
3171         }
3172         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3173 }
3174
3175
3176
// Vertex stage of the depth/shadow shader: only the position array is
// processed, via DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform.
void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
{
	DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
}
3181
3182 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3183 {
3184         // this is never called (because colormask is off when this shader is used)
3185         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3186         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3187         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3188         memset(buffer_FragColorbgra8, 0, span->length*4);
3189         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3190 }
3191
3192
3193
// Vertex stage of the flat-color shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix
// uniform, and runs texcoord0 through DPSOFTRAST_Array_Transform with the
// TexMatrix uniform.
void DPSOFTRAST_VertexShader_FlatColor(void)
{
	DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
	DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
}
3199
3200 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3201 {
3202         int x, startx = span->startx, endx = span->endx;
3203         int Color_Ambienti[4];
3204         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3205         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3208         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3209         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3210         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3211         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3212         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3213         for (x = startx;x < endx;x++)
3214         {
3215                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3216                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3217                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3218                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3219         }
3220         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3221 }
3222
3223
3224
// Vertex stage of the vertex-color shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix
// uniform, passes the color array through DPSOFTRAST_Array_Load, and runs
// texcoord0 through DPSOFTRAST_Array_Transform with the TexMatrix uniform.
void DPSOFTRAST_VertexShader_VertexColor(void)
{
	DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
	DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
	DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
}
3231
// Pixel stage of the vertex-color shader.  Per pixel it computes
//   texture * (Color_Diffuse * vertexcolor + Color_Ambient)
// in 16-bit fixed point, with alpha taken from the Alpha uniform.
// Output goes straight into the color framebuffer unless alpha test is on or
// the blend mode is not opaque, in which case it goes to a local buffer that
// is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Empty when SSE2_PRESENT is not defined.
void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
#ifdef SSE2_PRESENT
	unsigned char * RESTRICT pixelmask = span->pixelmask;
	// default destination: this span's row in the first color buffer
	unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
	int x, startx = span->startx, endx = span->endx;
	__m128i Color_Ambientm, Color_Diffusem;
	__m128 data, slope;
	float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
	unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	int arrayindex = DPSOFTRAST_ARRAY_COLOR;
	DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
	DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
	// alpha test / blending need the Finish pass, so render into the local buffer instead
	if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
		pixel = buffer_FragColorbgra8;
	// ambient color * 256 with components 0 and 2 swapped; lane 3 is replaced by Alpha * 255
	Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
	Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
	Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
	Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
	// diffuse color * 4096 with components 0 and 2 swapped; lane 3 cleared so alpha comes only from the ambient term
	Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
	Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
	Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
	// vertex color attribute: base and per-pixel slope, reordered like the colors above,
	// advanced to startx and scaled by 4096
	DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
	data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
	slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
	data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
	data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
	slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
	// data is stepped once per pixel by the loop increment, including for skipped pixels
	for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
	{
		__m128i color, mod, pix;
		// fast path: four pixels remain and all four mask bytes are 1
		if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
		{
			__m128i pix2, mod2;
			__m128 z = _mm_loadu_ps(&buffer_z[x]);
			color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
			// mod holds pixels x and x+1, mod2 holds x+2 and x+3; data is stepped three times here
			mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
			data = _mm_add_ps(data, slope);
			mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
			data = _mm_add_ps(data, slope);
			mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
			data = _mm_add_ps(data, slope);
			mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
			pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
								  _mm_unpacklo_epi8(_mm_setzero_si128(), color));
			pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
								   _mm_unpackhi_epi8(_mm_setzero_si128(), color));
			_mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
			// the loop increment supplies the fourth step of x and data
			x += 3;
			continue;
		}
		// skip pixels that are masked out
		if (!pixelmask[x])
			continue;
		// single-pixel path
		color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
		mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
		mod = _mm_packs_epi32(mod, mod);
		pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
		*(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
	}
	// only the buffered path needs the finishing pass
	if (pixel == buffer_FragColorbgra8)
		DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
3296
3297
3298
// Vertex stage of the lightmap shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix
// uniform, runs texcoord0 through DPSOFTRAST_Array_Transform with the
// TexMatrix uniform, and passes texcoord4 through DPSOFTRAST_Array_Load.
void DPSOFTRAST_VertexShader_Lightmap(void)
{
	DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
	DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
	DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
}
3305
// Pixel stage of the lightmap shader.  Per pixel it computes
//   color * (Color_Diffuse * lightmap + Color_Ambient)
// and, with the GLOW permutation, adds Color_Glow * glow texture, all in
// 16-bit fixed point with alpha taken from the Alpha uniform.
// Output goes straight into the color framebuffer unless alpha test is on or
// the blend mode is not opaque, in which case it goes to a local buffer that
// is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Empty when SSE2_PRESENT is not defined.
void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
#ifdef SSE2_PRESENT
	unsigned char * RESTRICT pixelmask = span->pixelmask;
	// default destination: this span's row in the first color buffer
	unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
	int x, startx = span->startx, endx = span->endx;
	__m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
	float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
	unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
	DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
	DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
	// alpha test / blending need the Finish pass, so render into the local buffer instead
	if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
		pixel = buffer_FragColorbgra8;
	// ambient color * 256 with components 0 and 2 swapped; lane 3 is replaced by Alpha * 255
	Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
	Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
	Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
	Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
	// diffuse color * 256 with components 0 and 2 swapped; lane 3 cleared so alpha comes only from the ambient term
	Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
	Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
	Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
	if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
	{
		DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
		// glow color * 256 with components 0 and 2 swapped, lane 3 cleared
		Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
		Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
		Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
		// low half = ambient, high half = glow color; used by the single-pixel path below
		Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
		for (x = startx;x < endx;x++)
		{
			__m128i color, lightmap, glow, pix;
			// fast path: four pixels remain and all four mask bytes are 1
			if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
			{
				__m128i pix2;
				color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
				lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
				glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
				// pix covers pixels x and x+1, pix2 covers x+2 and x+3
				pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
													_mm_unpacklo_epi8(_mm_setzero_si128(), color)),
									_mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
				pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
													_mm_unpackhi_epi8(_mm_setzero_si128(), color)),
									_mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
				_mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
				// the loop increment supplies the fourth step of x
				x += 3;
				continue;
			}
			// skip pixels that are masked out
			if (!pixelmask[x])
				continue;
			color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
			lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
			glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
			// one multiply does both terms: the low half yields (diffuse*lightmap + ambient) * color,
			// the high half (where lightmap is zero) yields glowcolor * glow
			pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
			// fold the high half onto the low half to add the two terms
			pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
			*(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
		}
	}
	else
	{
		for (x = startx;x < endx;x++)
		{
			__m128i color, lightmap, pix;
			// fast path: four pixels remain and all four mask bytes are 1
			if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
			{
				__m128i pix2;
				color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
				lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
				pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
									  _mm_unpacklo_epi8(_mm_setzero_si128(), color));
				pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
									   _mm_unpackhi_epi8(_mm_setzero_si128(), color));
				_mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
				// the loop increment supplies the fourth step of x
				x += 3;
				continue;
			}
			// skip pixels that are masked out
			if (!pixelmask[x])
				continue;
			color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
			lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
			pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
			*(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
		}
	}
	// only the buffered path needs the finishing pass
	if (pixel == buffer_FragColorbgra8)
		DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
3396
3397
3398
3399 void DPSOFTRAST_VertexShader_FakeLight(void)
3400 {
3401         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3402 }
3403
3404 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3405 {
3406         // TODO: IMPLEMENT
3407         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3408         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3409         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3410         memset(buffer_FragColorbgra8, 0, span->length*4);
3411         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3412 }
3413
3414
3415
// Vertex stage for the model-space light-direction-map mode.  It has no
// logic of its own yet and simply runs the Lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3420
3421 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3422 {
3423         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3424         // TODO: IMPLEMENT
3425 }
3426
3427
3428
// Vertex stage for the tangent-space light-direction-map mode.  It has no
// logic of its own yet and simply runs the Lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3433
3434 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3435 {
3436         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3437         // TODO: IMPLEMENT
3438 }
3439
3440
3441
3442 void DPSOFTRAST_VertexShader_LightDirection(void)
3443 {
3444         int i;
3445         int numvertices = dpsoftrast.numvertices;
3446         float LightDir[4];
3447         float LightVector[4];
3448         float EyePosition[4];
3449         float EyeVectorModelSpace[4];
3450         float EyeVector[4];
3451         float position[4];
3452         float svector[4];
3453         float tvector[4];
3454         float normal[4];
3455         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3456         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3457         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3458         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3459         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3460         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3461         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3462         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3463         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3464         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3465         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3466         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3467         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3468         for (i = 0;i < numvertices;i++)
3469         {
3470                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3471                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3472                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3473                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3474                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3475                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3476                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3477                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3478                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3479                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3480                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3481                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3482                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3483                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3484                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3485                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3486                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3487                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3488                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3489                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3490                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3491                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3492                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3493                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3494                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3495                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3496                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3497                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3498                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3499         }
3500         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3501 }
3502
// Scalar and 3-component vector helper macros.
// These are plain macros, so an argument may be evaluated more than once:
// never pass an expression with side effects.

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of elements 0..2 of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length and length of elements 0..2 of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales elements 0..2 of v in place to unit length; a zero-length v is left
// untouched.  v is parenthesized at every use so pointer expressions such as
// (p + 1) work, and the temporary carries a prefixed name so it cannot hide
// a caller variable that is passed in as v.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_scale = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_scale)\
	{\
		dpsoftrast_normalize_scale = 1.0f / dpsoftrast_normalize_scale;\
		(v)[0] *= dpsoftrast_normalize_scale;\
		(v)[1] *= dpsoftrast_normalize_scale;\
		(v)[2] *= dpsoftrast_normalize_scale;\
	}\
}\
while(0)
3521
3522 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3523 {
3524         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3525         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3526         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3527         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3528         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3529         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3530         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3531         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3532         int x, startx = span->startx, endx = span->endx;
3533         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3534         float LightVectordata[4];
3535         float LightVectorslope[4];
3536         float EyeVectordata[4];
3537         float EyeVectorslope[4];
3538         float z;
3539         float diffusetex[4];
3540         float glosstex[4];
3541         float surfacenormal[4];
3542         float lightnormal[4];
3543         float eyenormal[4];
3544         float specularnormal[4];
3545         float diffuse;
3546         float specular;
3547         float SpecularPower;
3548         int d[4];
3549         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3550         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3551         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3552         Color_Glow[3] = 0.0f;
3553         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3554         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3555         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3556         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3557         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3558         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3559         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3560         Color_Pants[3] = 0.0f;
3561         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3562         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3563         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3564         Color_Shirt[3] = 0.0f;
3565         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3566         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3567         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3568         {
3569                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3570                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3571         }
3572         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3573         {
3574                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3575         }
3576         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3577         {
3578                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3579                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3580                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3581                 Color_Diffuse[3] = 0.0f;
3582                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3583                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3584                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3585                 LightColor[3] = 0.0f;
3586                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3587                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3588                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3589                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3590                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3591                 Color_Specular[3] = 0.0f;
3592                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3593                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3594                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3595                 for (x = startx;x < endx;x++)
3596                 {
3597                         z = buffer_z[x];
3598                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3599                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3600                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3601                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3602                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3603                         {
3604                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3605                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3606                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3607                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3608                         }
3609                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3610                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3611                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3612                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3613                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3614                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3615                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3616                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3617
3618                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3619                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3620                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3621                         DPSOFTRAST_Vector3Normalize(lightnormal);
3622
3623                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3624                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3625                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3626                         DPSOFTRAST_Vector3Normalize(eyenormal);
3627
3628                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3629                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3630                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3631                         DPSOFTRAST_Vector3Normalize(specularnormal);
3632
3633                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3634                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3635                         specular = pow(specular, SpecularPower * glosstex[3]);
3636                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3637                         {
3638                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3639                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3640                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3641                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3642                         }
3643                         else
3644                         {
3645                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3646                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3647                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3648                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3649                         }
3650                         buffer_FragColorbgra8[x*4+0] = d[0];
3651                         buffer_FragColorbgra8[x*4+1] = d[1];
3652                         buffer_FragColorbgra8[x*4+2] = d[2];
3653                         buffer_FragColorbgra8[x*4+3] = d[3];
3654                 }
3655         }
3656         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3657         {
3658                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3659                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3660                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3661                 Color_Diffuse[3] = 0.0f;
3662                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3663                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3664                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3665                 LightColor[3] = 0.0f;
3666                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3667                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3668                 for (x = startx;x < endx;x++)
3669                 {
3670                         z = buffer_z[x];
3671                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3672                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3673                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3674                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3675                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3676                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3677                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3678                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3679
3680                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3681                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3682                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3683                         DPSOFTRAST_Vector3Normalize(lightnormal);
3684
3685                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3686                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3687                         {
3688                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3689                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3690                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3691                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3692                         }
3693                         else
3694                         {
3695                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3696                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3697                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3698                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3699                         }
3700                         buffer_FragColorbgra8[x*4+0] = d[0];
3701                         buffer_FragColorbgra8[x*4+1] = d[1];
3702                         buffer_FragColorbgra8[x*4+2] = d[2];
3703                         buffer_FragColorbgra8[x*4+3] = d[3];
3704                 }
3705         }
3706         else
3707         {
3708                 for (x = startx;x < endx;x++)
3709                 {
3710                         z = buffer_z[x];
3711                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3712                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3713                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3714                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3715
3716                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3717                         {
3718                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3719                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3720                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3721                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3722                         }
3723                         else
3724                         {
3725                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3726                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3727                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3728                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3729                         }
3730                         buffer_FragColorbgra8[x*4+0] = d[0];
3731                         buffer_FragColorbgra8[x*4+1] = d[1];
3732                         buffer_FragColorbgra8[x*4+2] = d[2];
3733                         buffer_FragColorbgra8[x*4+3] = d[3];
3734                 }
3735         }
3736         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3737 }
3738
3739
3740
3741 void DPSOFTRAST_VertexShader_LightSource(void)
3742 {
3743         int i;
3744         int numvertices = dpsoftrast.numvertices;
3745         float LightPosition[4];
3746         float LightVector[4];
3747         float LightVectorModelSpace[4];
3748         float EyePosition[4];
3749         float EyeVectorModelSpace[4];
3750         float EyeVector[4];
3751         float position[4];
3752         float svector[4];
3753         float tvector[4];
3754         float normal[4];
3755         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3756         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3757         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3758         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3759         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3760         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3761         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3762         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3763         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3764         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3765         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3766         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3767         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3768         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3769         for (i = 0;i < numvertices;i++)
3770         {
3771                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3772                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3773                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3774                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3775                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3776                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3777                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3778                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3779                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3780                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3781                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3782                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3783                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3784                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3785                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3786                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3787                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3788                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3789                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3790                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3791                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3792                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3793                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3794                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3795                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3796                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3797                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3798                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3799                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3800                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3801                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3802                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3803         }
3804         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3805         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3806 }
3807
3808 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3809 {
3810 #ifdef SSE2_PRESENT
3811         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3812         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3813         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3814         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3815         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3816         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3817         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3818         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3819         int x, startx = span->startx, endx = span->endx;
3820         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3821         float CubeVectordata[4];
3822         float CubeVectorslope[4];
3823         float LightVectordata[4];
3824         float LightVectorslope[4];
3825         float EyeVectordata[4];
3826         float EyeVectorslope[4];
3827         float z;
3828         float diffusetex[4];
3829         float glosstex[4];
3830         float surfacenormal[4];
3831         float lightnormal[4];
3832         float eyenormal[4];
3833         float specularnormal[4];
3834         float diffuse;
3835         float specular;
3836         float SpecularPower;
3837         float CubeVector[4];
3838         float attenuation;
3839         int d[4];
3840         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3841         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3842         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3843         Color_Glow[3] = 0.0f;
3844         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3845         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3846         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3847         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3848         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3849         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3850         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3851         Color_Diffuse[3] = 0.0f;
3852         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3853         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3854         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3855         Color_Specular[3] = 0.0f;
3856         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3857         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3858         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3859         Color_Pants[3] = 0.0f;
3860         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3861         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3862         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3863         Color_Shirt[3] = 0.0f;
3864         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3865         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3866         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3867         LightColor[3] = 0.0f;
3868         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3869         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3870         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3871         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3872         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3873         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3874         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3875         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3876         {
3877                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3878                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3879         }
3880         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3881                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3882         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3883         {
3884                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3885                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3886                 for (x = startx;x < endx;x++)
3887                 {
3888                         z = buffer_z[x];
3889                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3890                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3891                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3892                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3893                         if (attenuation < 0.01f)
3894                                 continue;
3895                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3896                         {
3897                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3898                                 if (attenuation < 0.01f)
3899                                         continue;
3900                         }
3901
3902                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3903                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3904                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3905                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3906                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3907                         {
3908                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3909                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3910                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3911                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3912                         }
3913                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3914                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3915                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3916                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3917                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3918                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3919                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3920                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3921
3922                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3923                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3924                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3925                         DPSOFTRAST_Vector3Normalize(lightnormal);
3926
3927                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3928                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3929                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3930                         DPSOFTRAST_Vector3Normalize(eyenormal);
3931
3932                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3933                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3934                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3935                         DPSOFTRAST_Vector3Normalize(specularnormal);
3936
3937                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3938                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3939                         specular = pow(specular, SpecularPower * glosstex[3]);
3940                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3941                         {
3942                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3943                                 attenuation *= (1.0f / 255.0f);
3944                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3945                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3946                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3947                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3948                         }
3949                         else
3950                         {
3951                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3952                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3953                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3954                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3955                         }
3956                         buffer_FragColorbgra8[x*4+0] = d[0];
3957                         buffer_FragColorbgra8[x*4+1] = d[1];
3958                         buffer_FragColorbgra8[x*4+2] = d[2];
3959                         buffer_FragColorbgra8[x*4+3] = d[3];
3960                 }
3961         }
3962         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3963         {
3964                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3965                 for (x = startx;x < endx;x++)
3966                 {
3967                         z = buffer_z[x];
3968                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3969                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3970                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3971                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3972                         if (attenuation < 0.01f)
3973                                 continue;
3974                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3975                         {
3976                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3977                                 if (attenuation < 0.01f)
3978                                         continue;
3979                         }
3980
3981                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3982                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3983                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3984                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3985                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3986                         {
3987                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3988                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3989                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3990                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3991                         }
3992                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3993                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3994                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3995                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3996
3997                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3998                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3999                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4000                         DPSOFTRAST_Vector3Normalize(lightnormal);
4001
4002                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4003                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4004                         {
4005                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4006                                 attenuation *= (1.0f / 255.0f);
4007                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4008                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4009                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4010                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4011                         }
4012                         else
4013                         {
4014                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4015                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4016                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4017                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4018                         }
4019                         buffer_FragColorbgra8[x*4+0] = d[0];
4020                         buffer_FragColorbgra8[x*4+1] = d[1];
4021                         buffer_FragColorbgra8[x*4+2] = d[2];
4022                         buffer_FragColorbgra8[x*4+3] = d[3];
4023                 }
4024         }
4025         else
4026         {
4027                 for (x = startx;x < endx;x++)
4028                 {
4029                         z = buffer_z[x];
4030                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4031                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4032                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4033                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4034                         if (attenuation < 0.01f)
4035                                 continue;
4036                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4037                         {
4038                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4039                                 if (attenuation < 0.01f)
4040                                         continue;
4041                         }
4042
4043                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4044                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4045                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4046                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4047                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4048                         {
4049                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4050                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4051                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4052                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4053                         }
4054                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4055                         {
4056                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4057                                 attenuation *= (1.0f / 255.0f);
4058                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4059                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4060                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4061                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4062                         }
4063                         else
4064                         {
4065                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4066                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4067                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4068                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4069                         }
4070                         buffer_FragColorbgra8[x*4+0] = d[0];
4071                         buffer_FragColorbgra8[x*4+1] = d[1];
4072                         buffer_FragColorbgra8[x*4+2] = d[2];
4073                         buffer_FragColorbgra8[x*4+3] = d[3];
4074                 }
4075         }
4076         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4077 #endif
4078 }
4079
4080
4081
4082 void DPSOFTRAST_VertexShader_Refraction(void)
4083 {
4084         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4085 }
4086
4087 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4088 {
4089         // TODO: IMPLEMENT
4090         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4091         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4092         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4093         memset(buffer_FragColorbgra8, 0, span->length*4);
4094         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4095 }
4096
4097
4098
4099 void DPSOFTRAST_VertexShader_Water(void)
4100 {
4101         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4102 }
4103
4104
4105 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4106 {
4107         // TODO: IMPLEMENT
4108         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4109         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4110         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4111         memset(buffer_FragColorbgra8, 0, span->length*4);
4112         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4113 }
4114
4115
4116
4117 void DPSOFTRAST_VertexShader_ShowDepth(void)
4118 {
4119         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4120 }
4121
4122 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4123 {
4124         // TODO: IMPLEMENT
4125         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4126         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4128         memset(buffer_FragColorbgra8, 0, span->length*4);
4129         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4130 }
4131
4132
4133
4134 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4135 {
4136         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4137 }
4138
4139 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4140 {
4141         // TODO: IMPLEMENT
4142         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4143         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4145         memset(buffer_FragColorbgra8, 0, span->length*4);
4146         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4147 }
4148
4149
4150
4151 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4152 {
4153         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4154 }
4155
4156 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4157 {
4158         // TODO: IMPLEMENT
4159         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4160         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4161         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4162         memset(buffer_FragColorbgra8, 0, span->length*4);
4163         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4164 }
4165
4166
4167
4168 typedef struct DPSOFTRAST_ShaderModeInfo_s
4169 {
4170         int lodarrayindex;
4171         void (*Vertex)(void);
4172         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4173         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4174         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4175 }
4176 DPSOFTRAST_ShaderModeInfo;
4177
4178 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4179 {
4180         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4181         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4182         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4183         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4184         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4185         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4186         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4187         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4188         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4189         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4190         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4191         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4192         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4193         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4194         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4195         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4196 };
4197
// Processes every span queued in thread->spans[0 .. thread->numspans) and
// then empties the queue by setting thread->numspans to 0.
//
// For each span:
//  - if depth testing is enabled and a depth buffer exists, a per-pixel
//    pass/fail mask is built using thread->fb_depthfunc, the span is trimmed
//    to the first and last passing pixels, the Span callback of the current
//    shader mode is run (only when a color buffer exists and the color mask
//    is nonzero), and finally depth is written for the pixels still set in
//    the mask when thread->depthmask is set;
//  - otherwise the Span callback is run over the whole span with an all-ones
//    mask, again only when a color buffer exists and the color mask is
//    nonzero.
//
// The mask lives in a local array, so span->pixelmask is only meaningful
// while this function is running.
4198 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4199 {
4200         int i;
4201         int x;
4202         int startx;
4203         int endx;
4204 //      unsigned int c;
4205 //      unsigned int *colorpixel;
4206         unsigned int *depthpixel;
4207         float w;
4208         float wslope;
4209         int depth;
4210         int depthslope;
4211         unsigned int d;
4212         DPSOFTRAST_State_Triangle *triangle;
4213         DPSOFTRAST_State_Span *span;
             // one mask byte per pixel of the span currently being processed
4214         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4215         for (i = 0; i < thread->numspans; i++)
4216         {
4217                 span = &thread->spans[i];
4218                 triangle = &thread->triangles[span->triangle];
4219                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4220                 {
                     // triangle->w holds a plane: w[0] is the change per pixel in x,
                     // w[1] the change per pixel in y and w[2] the value at the origin;
                     // evaluate it at the first pixel of the span
4221                         wslope = triangle->w[0];
4222                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
                     // convert to integer depth; the starting depth is biased by the
                     // polygon offset terms (constant term plus slope-scaled term)
4223                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4224                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4225                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
                     // build the per-pixel mask; d is unsigned, so the comparisons with
                     // the depth buffer are unsigned, and an unrecognized depth function
                     // behaves like GL_ALWAYS
4226                         switch(thread->fb_depthfunc)
4227                         {
4228                         default:
4229                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4230                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4231                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4232                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4233                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4234                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4235                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4236                         }
4237                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4238                         //for (x = 0;x < span->length;x++)
4239                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4240                         // if there is no color buffer, skip pixel shader
                     // shrink [startx, endx) so that it begins and ends on a pixel
                     // that passed the depth test
4241                         startx = 0;
4242                         endx = span->length;
4243                         while (startx < endx && !pixelmask[startx])
4244                                 startx++;
4245                         while (endx > startx && !pixelmask[endx-1])
4246                                 endx--;
4247                         if (startx >= endx)
4248                                 continue; // no pixels to fill
4249                         span->pixelmask = pixelmask;
4250                         span->startx = startx;
4251                         span->endx = endx;
4252                         // run pixel shader if appropriate
4253                         // do this before running depthmask code, to allow the pixelshader
4254                         // to clear pixelmask values for alpha testing
4255                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4256                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
                     // write depth only for pixels still set in the mask, resuming the
                     // depth interpolation at startx
4257                         if (thread->depthmask)
4258                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4259                                         if (pixelmask[x])
4260                                                 depthpixel[x] = d;
4261                 }
4262                 else
4263                 {
4264                         // no depth testing means we're just dealing with color...
4265                         // if there is no color buffer, skip pixel shader
4266                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4267                         {
                             // every pixel of the span is enabled
4268                                 memset(pixelmask, 1, span->length);
4269                                 span->pixelmask = pixelmask;
4270                                 span->startx = 0;
4271                                 span->endx = span->length;
4272                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4273                         }
4274                 }
4275         }
             // all queued spans have been consumed
4276         thread->numspans = 0;
4277 }
4278
// Declares the Draw command and its payload fields.
// NOTE(review): DEFCOMMAND is defined outside this section; presumably it produces the
// DPSOFTRAST_Command_Draw struct and DPSOFTRAST_OPCODE_Draw (value 22) used below - confirm.
// Fields as used in this section: starty/endy are compared against each thread's bands,
// refcount is decremented by each thread that finishes with the command, clipped selects
// near-plane clipping, and arrays/element3i/element3s point at the copied vertex and index data.
4279 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4280
// Executes one Draw command on behalf of one rasterizer thread.
//
// thread  - per-thread state; supplies the two scanline bands this thread owns
//           ([miny1, maxy1) and [miny2, maxy2)), culling mode, viewport, bound
//           textures and the triangle/span queues that are filled here
// command - the Draw command; command->arrays starts with the screen-space
//           vertex block, followed by the position block and one block per
//           shader array (see DPSOFTRAST_Draw_AllocateDrawCommand)
//
// For every triangle the function optionally clips against the near plane,
// culls by facing, rejects triangles outside this thread's scanlines, sets up
// interpolation planes for w and every shader array, picks a mip level per
// texture unit, and emits spans for the scanlines in this thread's bands.
// Spans are flushed through DPSOFTRAST_Draw_ProcessSpans whenever a queue
// fills up and once more at the end.
// Each thread decrements command->refcount when done; the thread that brings
// it to zero frees command->arrays if it was allocated separately.
// The whole body is compiled only when SSE2_PRESENT is defined.
4281 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4282 {
4283 #ifdef SSE2_PRESENT
4284         int cullface = thread->cullface;
4285         int width = dpsoftrast.fb_width;
4286         int miny1 = thread->miny1;
4287         int maxy1 = thread->maxy1;
4288         int miny2 = thread->miny2;
4289         int maxy2 = thread->maxy2;
4290         __m128i fbmin, fbmax;
4291         __m128 viewportcenter, viewportscale;
4292         int firstvertex = command->firstvertex;
4293         int numvertices = command->numvertices;
4294         int numtriangles = command->numtriangles;
4295         const int *element3i = command->element3i;
4296         const unsigned short *element3s = command->element3s;
4297         int clipped = command->clipped;
4298         int i;
4299         int j;
4300         int k;
4301         int y;
4302         int e[3];
4303         __m128i screeny;
4304         int starty, endy, bandy;
4305         int numpoints;
4306         int clipcase;
4307         float clipdist[4];
4308         __m128 triangleedge1, triangleedge2, trianglenormal;
4309         __m128 clipfrac[3];
4310         __m128 screen[4];
4311         DPSOFTRAST_State_Triangle *triangle;
4312         DPSOFTRAST_Texture *texture;
             // early out: the command's scanline range touches neither of this
             // thread's bands, so only release this thread's reference
4313         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4314         {
4315                 if (!ATOMIC_DECREMENT(command->refcount))
4316                 {
                             // a command no larger than the bare struct means the vertex data
                             // was allocated separately rather than stored inline after it
4317                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4318                                 MM_FREE(command->arrays);
4319                 }
4320                 return;
4321         }
4322         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // packed 16-bit (x, y) limits repeated for four points:
             // minimum is (0, miny1), maximum is (width-1, maxy2-1)
4323         fbmin = _mm_setr_epi16(0, miny1, 0, miny1, 0, miny1, 0, miny1);
4324         fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy2, width, maxy2, width, maxy2, width, maxy2), _mm_set1_epi16(1));
4325         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4326         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4327         screen[3] = _mm_setzero_ps();
4328         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4329         for (i = 0;i < numtriangles;i++)
4330         {
                     // screencoord4f is the first block of command->arrays; arrays starts at
                     // the block that follows it and is advanced one block per shader array
                     // further down
4331                 const float *screencoord4f = command->arrays;
4332                 const float *arrays = screencoord4f + numvertices*4;
4333
4334                 // generate the 3 edges of this triangle
4335                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // fetch the three vertex indices, relative to firstvertex when an
                     // index list is present, otherwise sequential
4336                 if (element3s)
4337                 {
4338                         e[0] = element3s[i*3+0] - firstvertex;
4339                         e[1] = element3s[i*3+1] - firstvertex;
4340                         e[2] = element3s[i*3+2] - firstvertex;
4341                 }
4342                 else if (element3i)
4343                 {
4344                         e[0] = element3i[i*3+0] - firstvertex;
4345                         e[1] = element3i[i*3+1] - firstvertex;
4346                         e[2] = element3i[i*3+2] - firstvertex;
4347                 }
4348                 else
4349                 {
4350                         e[0] = i*3+0;
4351                         e[1] = i*3+1;
4352                         e[2] = i*3+2;
4353                 }
4354
                     // SKIPBACKFACE: computes the two edges from screen[1] and, in lane 0 of
                     // trianglenormal, edge1.x*edge2.y - edge1.y*edge2.x; the triangle is
                     // skipped (continue of the triangle loop) when that value is negative
                     // with GL_BACK culling or positive with GL_FRONT culling
4355 #define SKIPBACKFACE \
4356                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4357                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4358                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4359                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4360                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4361                 switch(cullface) \
4362                 { \
4363                 case GL_BACK: \
4364                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4365                                 continue; \
4366                         break; \
4367                 case GL_FRONT: \
4368                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4369                                 continue; \
4370                         break; \
4371                 }
4372
                     // CLIPPEDVERTEXLERP: stores the crossing fraction between vertices p1 and
                     // p2 in clipfrac[p1], interpolates the two vertices read from the current
                     // arrays block and projects the result into screen[k]
4373 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4374                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4375                         { \
4376                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4377                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4378                         }
                     // CLIPPEDVERTEXCOPY: uses the already projected vertex p1 unchanged
4379 #define CLIPPEDVERTEXCOPY(k,p1) \
4380                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4381
                     // GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS: produce the three attribute
                     // values used for plane setup from the current arrays block, copying
                     // unclipped vertices and interpolating with the stored clipfrac values
                     // according to clipcase
4382 #define GENATTRIBCOPY(attrib, p1) \
4383                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4384 #define GENATTRIBLERP(attrib, p1, p2) \
4385                 { \
4386                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4387                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4388                 }
4389 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4390                 switch(clipcase) \
4391                 { \
4392                 default: \
4393                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4394                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4395                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4396                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4397                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4398                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4399                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4400                 }
4401
                     // commands not flagged as clipped skip the near-plane test and jump
                     // directly to the "entirely in front" case inside the nested ifs below
4402                 if (! clipped)
4403                         goto notclipped;
4404
4405                 // calculate distance from nearplane
                     // (component 2 plus component 3 of each vertex in the arrays block)
4406                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4407                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4408                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have a non-negative distance; each case
                     // fills screen[0..numpoints) with 3 or 4 points and records clipcase so
                     // GENATTRIBS can rebuild matching attribute values later
4409                 if (clipdist[0] >= 0.0f)
4410                 {
4411                         if (clipdist[1] >= 0.0f)
4412                         {
4413                                 if (clipdist[2] >= 0.0f)
4414                                 {
4415                                 notclipped:
4416                                         // triangle is entirely in front of nearplane
4417                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4418                                         SKIPBACKFACE;
4419                                         numpoints = 3;
4420                                         clipcase = 0;
4421                                 }
4422                                 else
4423                                 {
4424                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4425                                         SKIPBACKFACE;
4426                                         numpoints = 4;
4427                                         clipcase = 1;
4428                                 }
4429                         }
4430                         else
4431                         {
4432                                 if (clipdist[2] >= 0.0f)
4433                                 {
4434                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4435                                         SKIPBACKFACE;
4436                                         numpoints = 4;
4437                                         clipcase = 2;
4438                                 }
4439                                 else
4440                                 {
4441                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4442                                         SKIPBACKFACE;
4443                                         numpoints = 3;
4444                                         clipcase = 3;
4445                                 }
4446                         }
4447                 }
4448                 else if (clipdist[1] >= 0.0f)
4449                 {
4450                         if (clipdist[2] >= 0.0f)
4451                         {
4452                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4453                                 SKIPBACKFACE;
4454                                 numpoints = 4;
4455                                 clipcase = 4;
4456                         }
4457                         else
4458                         {
4459                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4460                                 SKIPBACKFACE;
4461                                 numpoints = 3;
4462                                 clipcase = 5;
4463                         }
4464                 }
4465                 else if (clipdist[2] >= 0.0f)
4466                 {
4467                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4468                         SKIPBACKFACE;
4469                         numpoints = 3;
4470                         clipcase = 6;
4471                 }
4472                 else continue; // triangle is entirely behind nearplane
4473
4474                 {
4475                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of each point as 16-bit pairs;
                             // for a 3-point polygon the third point is duplicated into slot 3.
                             // The min/max reductions leave the bounding box in the low lanes,
                             // which is then clamped to fbmin/fbmax
4476                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4477                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4478                                         screenmin = _mm_min_epi16(screeni, screenir),
4479                                         screenmax = _mm_max_epi16(screeni, screenir);
4480                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4481                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4482                         screenmin = _mm_max_epi16(screenmin, fbmin);
4483                         screenmax = _mm_min_epi16(screenmax, fbmax);
4484                         // skip offscreen triangles
4485                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4486                                 continue;
                             // [starty, endy) is the clamped scanline range of the triangle
4487                         starty = _mm_extract_epi16(screenmin, 1);
4488                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // the triangle lies entirely in the gap between this thread's bands
4489                         if (starty >= maxy1 && endy <= miny2)
4490                                 continue;
                             // sign-extending shift leaves each point's integer y in a 32-bit lane
4491                         screeny = _mm_srai_epi32(screeni, 16);
4492                 }
4493
                     // next free slot of the per-thread triangle queue
4494                 triangle = &thread->triangles[thread->numtriangles];
4495
4496                 // calculate attribute plans for triangle data...
4497                 // okay, this triangle is going to produce spans, we'd better project
4498                 // the interpolants now (this is what gives perspective texturing),
4499                 // this consists of simply multiplying all arrays by the W coord
4500                 // (which is basically 1/Z), which will be undone per-pixel
4501                 // (multiplying by Z again) to get the perspective-correct array
4502                 // values
4503                 {
4504                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4505                         __m128 mipedgescale, mipdensity;
                             // x and y deltas of both edges divided by lane 0 of trianglenormal,
                             // then each of the four quotients broadcast to all lanes
4506                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4507                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4508                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4509                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4510                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                             // component 3 of each projected point, broadcast
4511                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4512                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4513                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: x slope, y slope and value at the screen origin,
                             // stored as triangle->w[0], w[1] and w[2]
4514                         attribedge1 = _mm_sub_ss(w0, w1);
4515                         attribedge2 = _mm_sub_ss(w2, w1);
4516                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4517                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4518                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4519                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4520                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4521                         _mm_store_ss(&triangle->w[0], attribxslope);
4522                         _mm_store_ss(&triangle->w[1], attribyslope);
4523                         _mm_store_ss(&triangle->w[2], attriborigin);
4524                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every array listed by the current shader mode;
                             // the list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL
4525                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4526                         {
4527                                 __m128 attrib0, attrib1, attrib2;
4528                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4529                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4530                                         break;
                                     // step to the next per-vertex block of command->arrays
4531                                 arrays += numvertices*4;
4532                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                     // attributes are multiplied by w before the plane is derived
4533                                 attriborigin = _mm_mul_ps(attrib1, w1);
4534                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4535                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4536                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4537                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4538                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4539                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4540                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4541                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array the shader mode names as its LOD source, keep the
                                     // attribute change along each edge scaled by the reciprocal
                                     // screen-space length of that edge
4542                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4543                                 {
4544                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4545                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4546                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4547                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4548                                 }
4549                         }
4550
                             // choose a mip level for each texture unit the shader mode uses;
                             // units keep level 0 unless the bound texture has a filter above
                             // DPSOFTRAST_TEXTURE_FILTER_LINEAR
4551                         memset(triangle->mip, 0, sizeof(triangle->mip));
4552                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4553                         {
4554                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4555                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4556                                         break;
4557                                 texture = thread->texbound[texunit];
4558                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4559                                 {
                                             // scale by two consecutive ints read from &texture->mipmap[0][2]
                                             // NOTE(review): presumably the level-0 width and height - confirm
4560                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4561                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4562                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
                                             // keep the smaller of the two per-edge values
4563                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4564                                         // this will be multiplied in the texturing routine by the texture resolution
4565                                         y = _mm_cvtss_si32(mipdensity);
4566                                         if (y > 0)
4567                                         {
                                                     // half of log2(y), clamped to the last available mip level
4568                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4569                                                 if (y > texture->mipmaps - 1)
4570                                                         y = texture->mipmaps - 1;
4571                                                 triangle->mip[texunit] = y;
4572                                         }
4573                                 }
4574                         }
4575                 }
4576         
                     // walk the triangle's scanlines that fall into this thread's bands: the
                     // outer loop runs the first band up to min(endy, maxy1), then moves y
                     // forward to at least miny2 and runs up to min(endy, maxy2)
4577                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4578                 for (; y < bandy;)
4579                 {
4580                         __m128 xcoords, xslope;
                             // each 32-bit lane of ycc is all ones when y is greater than that
                             // point's integer y; yccmask holds four mask bits per point
4581                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4582                         int yccmask = _mm_movemask_epi8(ycc);
4583                         int edge0p, edge0n, edge1p, edge1n;
4584                         int nexty;
                             // pick the two polygon edges (previous point p, next point n) that
                             // cross the current scanline; when y is greater than every point the
                             // triangle is finished, when y is greater than none the scanline is
                             // skipped
4585                         if (numpoints == 4)
4586                         {
4587                                 switch(yccmask)
4588                                 {
4589                                 default:
4590                                 case 0xFFFF: /*0000*/ y = endy; continue;
4591                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4592                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4593                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4594                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4595                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4596                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4597                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4598                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4599                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4600                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4601                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4602                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4603                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4604                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4605                                 case 0x0000: /*1111*/ y++; continue;
4606                                 }
4607                         }
4608                         else
4609                         {
4610                                 switch(yccmask)
4611                                 {
4612                                 default:
4613                                 case 0xFFFF: /*000*/ y = endy; continue;
4614                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4615                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4616                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4617                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4618                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4619                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4620                                 case 0x0000: /*111*/ y++; continue;
4621                                 }
4622                         }
                             // NOTE(review): this appears to reduce to the smallest integer y among
                             // the points not yet passed, i.e. the last scanline on which the chosen
                             // edge pair stays valid - confirm; it is limited to the current band
4623                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4624                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4625                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4626                         nexty = _mm_extract_epi16(ycc, 0);
4627                         if (nexty >= bandy) nexty = bandy-1;
                             // exchange the edges when the largest x of edge0 exceeds the smallest x
                             // of edge1, so that edge0 supplies startx and edge1 supplies endx
4628                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4629                         {
4630                                 int tmp = edge0n;
4631                                 edge0n = edge1n;
4632                                 edge1n = tmp;
4633                                 tmp = edge0p;
4634                                 edge0p = edge1p;
4635                                 edge1p = tmp;
4636                         }
                             // lanes 0 and 2 of xslope become dx/dy of edge0 and edge1; xcoords holds
                             // both edges' x at scanline y, plus 0.5 ahead of the integer conversion
4637                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4638                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4639                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4640                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4641                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // emit spans for every scanline up to and including nexty, stepping
                             // both edge x coordinates by their slopes
4642                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4643                         {
4644                                 int startx, endx, offset;
4645                                 startx = _mm_cvtss_si32(xcoords);
4646                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // clamp to the framebuffer width and drop empty scanlines
4647                                 if (startx < 0) startx = 0;
4648                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4649                                 if (startx >= endx) continue;
                                     // split the scanline into spans of at most
                                     // DPSOFTRAST_DRAW_MAXSPANLENGTH pixels and flush the span queue
                                     // whenever it becomes full
4650                                 for (offset = startx; offset < endx;)
4651                                 {
4652                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4653                                         span->triangle = thread->numtriangles;
4654                                         span->x = offset;
4655                                         span->y = y;
4656                                         span->length = endx - offset;
4657                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4658                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4659                                         offset += span->length;
4660                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4661                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4662                                 }
4663                         }
4664                 }
4665
                     // the triangle slot is now in use; when the triangle queue is full the
                     // pending spans are processed first so the slots can be reused
4666                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4667                 {
4668                         DPSOFTRAST_Draw_ProcessSpans(thread);
4669                         thread->numtriangles = 0;
4670                 }
4671         }
4672
             // release this thread's reference; the thread that brings the count to zero
             // frees the vertex data if it was allocated separately from the command
4673         if (!ATOMIC_DECREMENT(command->refcount))
4674         {
4675                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4676                         MM_FREE(command->arrays);
4677         }
4678
             // process whatever is still queued and reset the triangle queue
4679         if (thread->numspans > 0 || thread->numtriangles > 0)
4680         {
4681                 DPSOFTRAST_Draw_ProcessSpans(thread);
4682                 thread->numtriangles = 0;
4683         }
4684 #endif
4685 }
4686
// Allocates a Draw command together with the storage for its per-vertex data
// and a private copy of the index list.
//
// firstvertex, numvertices, numtriangles - stored in the command and (the
//     first two) in the global dpsoftrast state
// element3i / element3s - optional index lists; element3s takes precedence
//     when both are given, and with neither the command carries no indices
//
// Data layout, in order: one float[4] block per vertex for screen coordinates,
// one for DPSOFTRAST_ARRAY_POSITION, one for each array listed by the current
// shader mode, then the copied indices.
// If command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is
// allocated separately with MM_CALLOC, otherwise it is placed directly after
// the command in the command buffer.
// As a side effect dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] are
// pointed at the new blocks.
// The fields starty, endy, refcount and clipped are not set here.
// Returns the new command.
4687 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4688 {
4689         int i;
4690         int j;
4691         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
             // two blocks are always present: screen coordinates and positions
4692         int datasize = 2*numvertices*sizeof(float[4]);
4693         DPSOFTRAST_Command_Draw *command;
4694         unsigned char *data;
             // one more block for every array the current shader mode lists; the list
             // ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL
4695         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4696         {
4697                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4698                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4699                         break;
4700                 datasize += numvertices*sizeof(float[4]);
4701         }
             // room for the copied index list
4702         if (element3s)
4703                 datasize += numtriangles*sizeof(unsigned short[3]);
4704         else if (element3i)
4705                 datasize += numtriangles*sizeof(int[3]);
4706         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4707         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4708         {
                     // too large to store inline: the command holds only the struct and the
                     // data is allocated separately (DPSOFTRAST_Interpret_Draw frees it)
4709                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4710                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4711         }
4712         else
4713         {
                     // data is stored inline, directly after the command struct
4714                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4715                 data = (unsigned char *)command + commandsize;
4716         }
4717         command->firstvertex = firstvertex;
4718         command->numvertices = numvertices;
4719         command->numtriangles = numtriangles;
4720         command->arrays = (float *)data;
             // clear all output array pointers, then hand out the blocks in layout order
4721         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4722         dpsoftrast.firstvertex = firstvertex;
4723         dpsoftrast.numvertices = numvertices;
4724         dpsoftrast.screencoord4f = (float *)data;
4725         data += numvertices*sizeof(float[4]);
4726         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4727         data += numvertices*sizeof(float[4]);
4728         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4729         {
4730                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4731                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4732                         break;
4733                 dpsoftrast.post_array4f[j] = (float *)data;
4734                 data += numvertices*sizeof(float[4]);
4735         }
             // copy the caller's indices behind the vertex blocks so the command does
             // not reference the caller's memory
4736         command->element3i = NULL;
4737         command->element3s = NULL;
4738         if (element3s)
4739         {
4740                 command->element3s = (unsigned short *)data;
4741                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4742         }
4743         else if (element3i)
4744         {
4745                 command->element3i = (int *)data;
4746                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4747         }
4748         return command;
4749 }
4750
4751 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4752 {
4753         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4754         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4755         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4756         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4757         if (command->starty >= command->endy)
4758         {
4759                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4760                         MM_FREE(command->arrays);
4761                 DPSOFTRAST_UndoCommand(command->commandsize);
4762                 return;
4763         }
4764         command->clipped = dpsoftrast.drawclipped;
4765         command->refcount = dpsoftrast.numthreads;
4766
4767 #ifdef USE_THREADS
4768         DPSOFTRAST_Draw_SyncCommands();
4769         {
4770                 int i;
4771                 for (i = 0; i < dpsoftrast.numthreads; i++)
4772                 {
4773                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4774                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4775                                 SDL_CondSignal(thread->drawcond);
4776                 }
4777         }
4778 #else
4779         DPSOFTRAST_Draw_FlushThreads();
4780 #endif
4781 }
4782  
4783 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4784 {
4785         int commandoffset = thread->commandoffset;
4786         while (commandoffset != endoffset)
4787         {
4788                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4789                 switch (command->opcode)
4790                 {
4791 #define INTERPCOMMAND(name) \
4792                 case DPSOFTRAST_OPCODE_##name : \
4793                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4794                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4795                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4796                                 commandoffset = 0; \
4797                         break;
4798                 INTERPCOMMAND(Viewport)
4799                 INTERPCOMMAND(ClearColor)
4800                 INTERPCOMMAND(ClearDepth)
4801                 INTERPCOMMAND(ColorMask)
4802                 INTERPCOMMAND(DepthTest)
4803                 INTERPCOMMAND(ScissorTest)
4804                 INTERPCOMMAND(Scissor)
4805                 INTERPCOMMAND(BlendFunc)
4806                 INTERPCOMMAND(BlendSubtract)
4807                 INTERPCOMMAND(DepthMask)
4808                 INTERPCOMMAND(DepthFunc)
4809                 INTERPCOMMAND(DepthRange)
4810                 INTERPCOMMAND(PolygonOffset)
4811                 INTERPCOMMAND(CullFace)
4812                 INTERPCOMMAND(AlphaTest)
4813                 INTERPCOMMAND(AlphaFunc)
4814                 INTERPCOMMAND(SetTexture)
4815                 INTERPCOMMAND(SetShader)
4816                 INTERPCOMMAND(Uniform4f)
4817                 INTERPCOMMAND(UniformMatrix4f)
4818                 INTERPCOMMAND(Uniform1i)
4819
4820                 case DPSOFTRAST_OPCODE_Draw:
4821                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4822                         commandoffset += command->commandsize;
4823                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4824                                 commandoffset = 0;
4825                         thread->commandoffset = commandoffset;
4826                         break;
4827
4828                 case DPSOFTRAST_OPCODE_Reset:
4829                         commandoffset = 0;
4830                         break;
4831                 }
4832         }
4833         thread->commandoffset = commandoffset;
4834 }
4835
4836 #ifdef USE_THREADS
4837 static int DPSOFTRAST_Draw_Thread(void *data)
4838 {
4839         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4840         while(thread->index >= 0)
4841         {
4842                 if (thread->commandoffset != dpsoftrast.drawcommand)
4843                 {
4844                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4845                 }
4846                 else 
4847                 {
4848                         SDL_LockMutex(thread->drawmutex);
4849                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4850                         {
4851                                 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4852                                 thread->starving = true;
4853                                 SDL_CondWait(thread->drawcond, thread->drawmutex);
4854                                 thread->starving = false;
4855                         }
4856                         SDL_UnlockMutex(thread->drawmutex);
4857                 }
4858         }   
4859         return 0;
4860 }
4861 #endif
4862
4863 static void DPSOFTRAST_Draw_FlushThreads(void)
4864 {
4865         DPSOFTRAST_State_Thread *thread;
4866         int i;
4867         DPSOFTRAST_Draw_SyncCommands();
4868 #ifdef USE_THREADS
4869         for (i = 0; i < dpsoftrast.numthreads; i++)
4870         {
4871                 thread = &dpsoftrast.threads[i];
4872                 if (thread->commandoffset != dpsoftrast.drawcommand)
4873                 {
4874                         SDL_LockMutex(thread->drawmutex);
4875                         if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4876                                 SDL_CondSignal(thread->drawcond);
4877                         SDL_UnlockMutex(thread->drawmutex);
4878                 }
4879         }
4880 #endif                  
4881         for (i = 0; i < dpsoftrast.numthreads; i++)
4882         {
4883                 thread = &dpsoftrast.threads[i];
4884 #ifdef USE_THREADS
4885                 if (thread->commandoffset != dpsoftrast.drawcommand)
4886                 {
4887                         SDL_LockMutex(thread->drawmutex);
4888                         if (thread->commandoffset != dpsoftrast.drawcommand)
4889                         {
4890                                 thread->waiting = true;
4891                                 SDL_CondWait(thread->waitcond, thread->drawmutex);
4892                                 thread->waiting = false;
4893                         }
4894                         SDL_UnlockMutex(thread->drawmutex);
4895                 }
4896 #else
4897                 if (thread->commandoffset != dpsoftrast.drawcommand)
4898                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4899 #endif
4900         }
4901         dpsoftrast.commandpool.usedcommands = 0;
4902 }
4903
// Public entry point: has all queued commands processed via
// DPSOFTRAST_Draw_FlushThreads before returning.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4908
// Public entry point: performs the same work as DPSOFTRAST_Flush, i.e. has
// all queued commands processed before returning.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4913
4914 void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4915 {
4916         int i;
4917         union
4918         {
4919                 int i;
4920                 unsigned char b[4];
4921         }
4922         u;
4923         u.i = 1;
4924         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4925         dpsoftrast.bigendian = u.b[3];
4926         dpsoftrast.fb_width = width;
4927         dpsoftrast.fb_height = height;
4928         dpsoftrast.fb_depthpixels = depthpixels;
4929         dpsoftrast.fb_colorpixels[0] = colorpixels;
4930         dpsoftrast.fb_colorpixels[1] = NULL;
4931         dpsoftrast.fb_colorpixels[1] = NULL;
4932         dpsoftrast.fb_colorpixels[1] = NULL;
4933         dpsoftrast.viewport[0] = 0;
4934         dpsoftrast.viewport[1] = 0;
4935         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4936         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4937         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4938         dpsoftrast.texture_firstfree = 1;
4939         dpsoftrast.texture_end = 1;
4940         dpsoftrast.texture_max = 0;
4941         dpsoftrast.color[0] = 1;
4942         dpsoftrast.color[1] = 1;
4943         dpsoftrast.color[2] = 1;
4944         dpsoftrast.color[3] = 1;
4945         dpsoftrast.interlace = bound(0, interlace, 1);
4946 #ifdef USE_THREADS
4947         dpsoftrast.numthreads = bound(1, numthreads, 64);
4948 #else
4949         dpsoftrast.numthreads = 1;
4950 #endif
4951         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4952         for (i = 0; i < dpsoftrast.numthreads; i++)
4953         {
4954                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4955                 thread->index = i;
4956                 thread->cullface = GL_BACK;
4957                 thread->colormask[1] = 1;
4958                 thread->colormask[2] = 1;
4959                 thread->colormask[3] = 1;
4960                 thread->blendfunc[0] = GL_ONE;
4961                 thread->blendfunc[1] = GL_ZERO;
4962                 thread->depthmask = true;
4963                 thread->depthtest = true;
4964                 thread->depthfunc = GL_LEQUAL;
4965                 thread->scissortest = false;
4966                 thread->alphatest = false;
4967                 thread->alphafunc = GL_GREATER;
4968                 thread->alphavalue = 0.5f;
4969                 thread->viewport[0] = 0;
4970                 thread->viewport[1] = 0;
4971                 thread->viewport[2] = dpsoftrast.fb_width;
4972                 thread->viewport[3] = dpsoftrast.fb_height;
4973                 thread->scissor[0] = 0;
4974                 thread->scissor[1] = 0;
4975                 thread->scissor[2] = dpsoftrast.fb_width;
4976                 thread->scissor[3] = dpsoftrast.fb_height;
4977                 thread->depthrange[0] = 0;
4978                 thread->depthrange[1] = 1;
4979                 thread->polygonoffset[0] = 0;
4980                 thread->polygonoffset[1] = 0;
4981         
4982                 if (dpsoftrast.interlace)
4983                 {
4984                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4985                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4986                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4987                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4988                 }
4989                 else
4990                 {
4991                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4992                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4993                 }
4994
4995                 thread->numspans = 0;
4996                 thread->numtriangles = 0;
4997                 thread->commandoffset = 0;
4998                 thread->waiting = false;
4999                 thread->starving = false;
5000 #ifdef USE_THREADS
5001                 thread->waitcond = SDL_CreateCond();
5002                 thread->drawcond = SDL_CreateCond();
5003                 thread->drawmutex = SDL_CreateMutex();
5004 #endif
5005
5006                 thread->validate = -1;
5007                 DPSOFTRAST_Validate(thread, -1);
5008 #ifdef USE_THREADS
5009                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5010 #endif
5011         }
5012 }
5013
5014 void DPSOFTRAST_Shutdown(void)
5015 {
5016         int i;
5017 #ifdef USE_THREADS
5018         if (dpsoftrast.numthreads > 0)
5019         {
5020                 DPSOFTRAST_State_Thread *thread;
5021                 for (i = 0; i < dpsoftrast.numthreads; i++)
5022                 {
5023                         thread = &dpsoftrast.threads[i];
5024                         SDL_LockMutex(thread->drawmutex);
5025                         thread->index = -1;
5026                         SDL_CondSignal(thread->drawcond);
5027                         SDL_UnlockMutex(thread->drawmutex);
5028                         SDL_WaitThread(thread->thread, NULL);
5029                         SDL_DestroyCond(thread->waitcond);
5030                         SDL_DestroyCond(thread->drawcond);
5031                         SDL_DestroyMutex(thread->drawmutex);
5032                 }
5033         }
5034 #endif
5035         for (i = 0;i < dpsoftrast.texture_end;i++)
5036                 if (dpsoftrast.texture[i].bytes)
5037                         MM_FREE(dpsoftrast.texture[i].bytes);
5038         if (dpsoftrast.texture)
5039                 free(dpsoftrast.texture);
5040         if (dpsoftrast.threads)
5041                 MM_FREE(dpsoftrast.threads);
5042         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5043 }
5044